1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm::AMDGPU {
44#define GET_D16ImageDimIntrinsics_IMPL
45#define GET_ImageDimIntrinsicTable_IMPL
46#define GET_RsrcIntrinsics_IMPL
47#include "AMDGPUGenSearchableTables.inc"
48} // namespace llvm::AMDGPU
49
50// Must be at least 4 to be able to branch over minimum unconditional branch
51// code. This is only for making it possible to write reasonably small tests for
52// long branches.
53static cl::opt<unsigned>
54BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
55 cl::desc("Restrict range of branch instructions (DEBUG)"));
56
58 "amdgpu-fix-16-bit-physreg-copies",
59 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
60 cl::init(true),
62
63SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
64 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
65 RI(ST), ST(ST) {
66 SchedModel.init(&ST);
67}
68
69//===----------------------------------------------------------------------===//
70// TargetInstrInfo callbacks
71//===----------------------------------------------------------------------===//
72
73static unsigned getNumOperandsNoGlue(SDNode *Node) {
74 unsigned N = Node->getNumOperands();
75 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
76 --N;
77 return N;
78}
79
80/// Returns true if both nodes have the same value for the given
81/// operand \p Op, or if both nodes do not have this operand.
82static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
83 unsigned Opc0 = N0->getMachineOpcode();
84 unsigned Opc1 = N1->getMachineOpcode();
85
86 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
87 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
88
89 if (Op0Idx == -1 && Op1Idx == -1)
90 return true;
91
92
93 if ((Op0Idx == -1 && Op1Idx != -1) ||
94 (Op1Idx == -1 && Op0Idx != -1))
95 return false;
96
97 // getNamedOperandIdx returns the index for the MachineInstr's operands,
98 // which includes the result as the first operand. We are indexing into the
99 // MachineSDNode's operands, so we need to skip the result operand to get
100 // the real index.
101 --Op0Idx;
102 --Op1Idx;
103
104 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
105}
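// Worked example of the index adjustment above (operand positions
// illustrative): if getNamedOperandIdx reports a named operand at MachineInstr
// index 2, the MachineSDNode stores it at index 1 because the SDNode operand
// list omits the result, so the comparison is between
// N0->getOperand(1) and N1->getOperand(1).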
106
107static bool canRemat(const MachineInstr &MI) {
108
109 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
110 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
111 SIInstrInfo::isSALU(MI))
112 return true;
113
114 if (SIInstrInfo::isSMRD(MI)) {
115 return !MI.memoperands_empty() &&
116 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
117 return MMO->isLoad() && MMO->isInvariant();
118 });
119 }
120
121 return false;
122}
123
124bool SIInstrInfo::isReallyTriviallyReMaterializable(
125 const MachineInstr &MI) const {
126
127 if (canRemat(MI)) {
128 // Normally a VALU use of exec would block rematerialization, but an
129 // implicit exec read is fine here, since every VALU instruction has one.
130 // We want all of the generic logic for this except for that one check.
131
132 // Another potential implicit use is mode register. The core logic of
133 // the RA will not attempt rematerialization if mode is set anywhere
134 // in the function, otherwise it is safe since mode is not changed.
135
136 // This differs from the generic method, which disallows rematerialization
137 // when there are virtual register uses. We allow those uses, which is why
138 // this method covers SOP instructions as well.
139 if (!MI.hasImplicitDef() &&
140 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
141 !MI.mayRaiseFPException())
142 return true;
143 }
144
145 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
146}
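// A hedged illustration of the SMRD path above: an s_load_dword whose only
// memory operand is an invariant load (e.g. from a constant kernel-argument
// address) passes the all_of check in canRemat(); provided the instruction has
// no implicit defs, no implicit operands beyond its descriptor's implicit
// uses, and cannot raise an FP exception, it is reported as trivially
// rematerializable.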
147
148// Returns true if the scalar result of a VALU instruction depends on exec.
149static bool resultDependsOnExec(const MachineInstr &MI) {
150 // Ignore comparisons which are only used masked with exec.
151 // This allows some hoisting/sinking of VALU comparisons.
152 if (MI.isCompare()) {
153 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
154 Register DstReg = MI.getOperand(0).getReg();
155 if (!DstReg.isVirtual())
156 return true;
157 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
158 switch (Use.getOpcode()) {
159 case AMDGPU::S_AND_SAVEEXEC_B32:
160 case AMDGPU::S_AND_SAVEEXEC_B64:
161 break;
162 case AMDGPU::S_AND_B32:
163 case AMDGPU::S_AND_B64:
164 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
165 return true;
166 break;
167 default:
168 return true;
169 }
170 }
171 return false;
172 }
173
174 switch (MI.getOpcode()) {
175 default:
176 break;
177 case AMDGPU::V_READFIRSTLANE_B32:
178 return true;
179 }
180
181 return false;
182}
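// Illustrative case for the compare handling above: a V_CMP_* whose virtual
// result feeds only an S_AND_B32/B64 that also reads exec (i.e. it is
// immediately masked with exec) or an S_AND_SAVEEXEC is treated as not
// depending on exec, so it may be hoisted or sunk; if the result instead
// reaches any other kind of user, the default case returns true and the
// motion is blocked.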
183
184bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185 // Any implicit use of exec by VALU is not a real register read.
186 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
188}
189
190bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
191 MachineBasicBlock *SuccToSinkTo,
192 MachineCycleInfo *CI) const {
193 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
194 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
195 return true;
196
197 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
198 // Check if sinking of MI would create temporal divergent use.
199 for (auto Op : MI.uses()) {
200 if (Op.isReg() && Op.getReg().isVirtual() &&
201 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
202 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
203
204 // SgprDef defined inside cycle
205 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
206 if (FromCycle == nullptr)
207 continue;
208
209 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
210 // Check if there is a FromCycle that contains SgprDef's basic block but
211 // does not contain SuccToSinkTo and also has divergent exit condition.
212 while (FromCycle && !FromCycle->contains(ToCycle)) {
213 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
214 FromCycle->getExitingBlocks(ExitingBlocks);
215
216 // FromCycle has divergent exit condition.
217 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
218 if (hasDivergentBranch(ExitingBlock))
219 return false;
220 }
221
222 FromCycle = FromCycle->getParentCycle();
223 }
224 }
225 }
226
227 return true;
228}
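// Sketch of the temporal-divergence check above (block layout hypothetical):
// if an SGPR is defined inside a cycle whose exiting block ends in a divergent
// branch, and the candidate sink target lies outside that cycle, the walk over
// FromCycle's exiting blocks finds the divergent exit and sinking is refused;
// cycles with only uniform exits fall through and the function returns true.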
229
230bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
231 int64_t &Offset0,
232 int64_t &Offset1) const {
233 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
234 return false;
235
236 unsigned Opc0 = Load0->getMachineOpcode();
237 unsigned Opc1 = Load1->getMachineOpcode();
238
239 // Make sure both are actually loads.
240 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
241 return false;
242
243 // A mayLoad instruction without a def is not a load. Likely a prefetch.
244 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
245 return false;
246
247 if (isDS(Opc0) && isDS(Opc1)) {
248
249 // FIXME: Handle this case:
250 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
251 return false;
252
253 // Check base reg.
254 if (Load0->getOperand(0) != Load1->getOperand(0))
255 return false;
256
257 // Skip read2 / write2 variants for simplicity.
258 // TODO: We should report true if the used offsets are adjacent (excluding
259 // the st64 versions).
260 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
261 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
262 if (Offset0Idx == -1 || Offset1Idx == -1)
263 return false;
264
265 // XXX - be careful of dataless loads
266 // getNamedOperandIdx returns the index for MachineInstrs. Since they
267 // include the output in the operand list, but SDNodes don't, we need to
268 // subtract the index by one.
269 Offset0Idx -= get(Opc0).NumDefs;
270 Offset1Idx -= get(Opc1).NumDefs;
271 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
272 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
273 return true;
274 }
275
276 if (isSMRD(Opc0) && isSMRD(Opc1)) {
277 // Skip time and cache invalidation instructions.
278 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
279 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
280 return false;
281
282 unsigned NumOps = getNumOperandsNoGlue(Load0);
283 if (NumOps != getNumOperandsNoGlue(Load1))
284 return false;
285
286 // Check base reg.
287 if (Load0->getOperand(0) != Load1->getOperand(0))
288 return false;
289
290 // Match register offsets, if both register and immediate offsets present.
291 assert(NumOps == 4 || NumOps == 5);
292 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
293 return false;
294
295 const ConstantSDNode *Load0Offset =
296 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
297 const ConstantSDNode *Load1Offset =
298 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
299
300 if (!Load0Offset || !Load1Offset)
301 return false;
302
303 Offset0 = Load0Offset->getZExtValue();
304 Offset1 = Load1Offset->getZExtValue();
305 return true;
306 }
307
308 // MUBUF and MTBUF can access the same addresses.
309 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
310
311 // MUBUF and MTBUF have vaddr at different indices.
312 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
313 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
314 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
315 return false;
316
317 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
318 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
319
320 if (OffIdx0 == -1 || OffIdx1 == -1)
321 return false;
322
323 // getNamedOperandIdx returns the index for MachineInstrs. Since they
324 // include the output in the operand list, but SDNodes don't, we need to
325 // subtract the index by one.
326 OffIdx0 -= get(Opc0).NumDefs;
327 OffIdx1 -= get(Opc1).NumDefs;
328
329 SDValue Off0 = Load0->getOperand(OffIdx0);
330 SDValue Off1 = Load1->getOperand(OffIdx1);
331
332 // The offset might be a FrameIndexSDNode.
333 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
334 return false;
335
336 Offset0 = Off0->getAsZExtVal();
337 Offset1 = Off1->getAsZExtVal();
338 return true;
339 }
340
341 return false;
342}
343
344static bool isStride64(unsigned Opc) {
345 switch (Opc) {
346 case AMDGPU::DS_READ2ST64_B32:
347 case AMDGPU::DS_READ2ST64_B64:
348 case AMDGPU::DS_WRITE2ST64_B32:
349 case AMDGPU::DS_WRITE2ST64_B64:
350 return true;
351 default:
352 return false;
353 }
354}
355
356bool SIInstrInfo::getMemOperandsWithOffsetWidth(
357 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
358 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
359 const TargetRegisterInfo *TRI) const {
360 if (!LdSt.mayLoadOrStore())
361 return false;
362
363 unsigned Opc = LdSt.getOpcode();
364 OffsetIsScalable = false;
365 const MachineOperand *BaseOp, *OffsetOp;
366 int DataOpIdx;
367
368 if (isDS(LdSt)) {
369 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
370 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
371 if (OffsetOp) {
372 // Normal, single offset LDS instruction.
373 if (!BaseOp) {
374 // DS_CONSUME/DS_APPEND use M0 for the base address.
375 // TODO: find the implicit use operand for M0 and use that as BaseOp?
376 return false;
377 }
378 BaseOps.push_back(BaseOp);
379 Offset = OffsetOp->getImm();
380 // Get appropriate operand, and compute width accordingly.
381 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
382 if (DataOpIdx == -1)
383 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
384 Width = getOpSize(LdSt, DataOpIdx);
385 } else {
386 // The 2 offset instructions use offset0 and offset1 instead. We can treat
387 // these as a load with a single offset if the 2 offsets are consecutive.
388 // We will use this for some partially aligned loads.
389 const MachineOperand *Offset0Op =
390 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
391 const MachineOperand *Offset1Op =
392 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
393
394 unsigned Offset0 = Offset0Op->getImm() & 0xff;
395 unsigned Offset1 = Offset1Op->getImm() & 0xff;
396 if (Offset0 + 1 != Offset1)
397 return false;
398
399 // Each of these offsets is in element sized units, so we need to convert
400 // to bytes of the individual reads.
401
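 // Worked example (values illustrative): for a ds_read2_b32 with offset0 = 4
 // and offset1 = 5, EltSize is 64 / 16 = 4 bytes, so the pair is reported as
 // one access at Offset = 4 * 4 = 16 bytes with Width = 4 + 4 = 8 bytes; an
 // st64 variant would additionally scale EltSize by 64.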
402 unsigned EltSize;
403 if (LdSt.mayLoad())
404 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
405 else {
406 assert(LdSt.mayStore());
407 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
408 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
409 }
410
411 if (isStride64(Opc))
412 EltSize *= 64;
413
414 BaseOps.push_back(BaseOp);
415 Offset = EltSize * Offset0;
416 // Get appropriate operand(s), and compute width accordingly.
417 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
418 if (DataOpIdx == -1) {
419 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
420 Width = getOpSize(LdSt, DataOpIdx);
421 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
422 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
423 } else {
424 Width = getOpSize(LdSt, DataOpIdx);
425 }
426 }
427 return true;
428 }
429
430 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
431 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
432 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
433 return false;
434 BaseOps.push_back(RSrc);
435 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
436 if (BaseOp && !BaseOp->isFI())
437 BaseOps.push_back(BaseOp);
438 const MachineOperand *OffsetImm =
439 getNamedOperand(LdSt, AMDGPU::OpName::offset);
440 Offset = OffsetImm->getImm();
441 const MachineOperand *SOffset =
442 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
443 if (SOffset) {
444 if (SOffset->isReg())
445 BaseOps.push_back(SOffset);
446 else
447 Offset += SOffset->getImm();
448 }
449 // Get appropriate operand, and compute width accordingly.
450 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
451 if (DataOpIdx == -1)
452 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
453 if (DataOpIdx == -1) // LDS DMA
454 return false;
455 Width = getOpSize(LdSt, DataOpIdx);
456 return true;
457 }
458
459 if (isImage(LdSt)) {
460 auto RsrcOpName =
461 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
462 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
463 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
464 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
465 if (VAddr0Idx >= 0) {
466 // GFX10 possible NSA encoding.
467 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
468 BaseOps.push_back(&LdSt.getOperand(I));
469 } else {
470 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
471 }
472 Offset = 0;
473 // Get appropriate operand, and compute width accordingly.
474 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
475 if (DataOpIdx == -1)
476 return false; // no return sampler
477 Width = getOpSize(LdSt, DataOpIdx);
478 return true;
479 }
480
481 if (isSMRD(LdSt)) {
482 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
483 if (!BaseOp) // e.g. S_MEMTIME
484 return false;
485 BaseOps.push_back(BaseOp);
486 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
487 Offset = OffsetOp ? OffsetOp->getImm() : 0;
488 // Get appropriate operand, and compute width accordingly.
489 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
490 if (DataOpIdx == -1)
491 return false;
492 Width = getOpSize(LdSt, DataOpIdx);
493 return true;
494 }
495
496 if (isFLAT(LdSt)) {
497 // Instructions have either vaddr or saddr or both or none.
498 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
499 if (BaseOp)
500 BaseOps.push_back(BaseOp);
501 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
502 if (BaseOp)
503 BaseOps.push_back(BaseOp);
504 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
505 // Get appropriate operand, and compute width accordingly.
506 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
507 if (DataOpIdx == -1)
508 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
509 if (DataOpIdx == -1) // LDS DMA
510 return false;
511 Width = getOpSize(LdSt, DataOpIdx);
512 return true;
513 }
514
515 return false;
516}
517
518static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
519 ArrayRef<const MachineOperand *> BaseOps1,
520 const MachineInstr &MI2,
521 ArrayRef<const MachineOperand *> BaseOps2) {
522 // Only examine the first "base" operand of each instruction, on the
523 // assumption that it represents the real base address of the memory access.
524 // Other operands are typically offsets or indices from this base address.
525 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
526 return true;
527
528 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
529 return false;
530
531 auto MO1 = *MI1.memoperands_begin();
532 auto MO2 = *MI2.memoperands_begin();
533 if (MO1->getAddrSpace() != MO2->getAddrSpace())
534 return false;
535
536 auto Base1 = MO1->getValue();
537 auto Base2 = MO2->getValue();
538 if (!Base1 || !Base2)
539 return false;
540 Base1 = getUnderlyingObject(Base1);
541 Base2 = getUnderlyingObject(Base2);
542
543 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
544 return false;
545
546 return Base1 == Base2;
547}
548
549bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
550 int64_t Offset1, bool OffsetIsScalable1,
551 ArrayRef<const MachineOperand *> BaseOps2,
552 int64_t Offset2, bool OffsetIsScalable2,
553 unsigned ClusterSize,
554 unsigned NumBytes) const {
555 // If the mem ops (to be clustered) do not have the same base ptr, then they
556 // should not be clustered
557 if (!BaseOps1.empty() && !BaseOps2.empty()) {
558 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
559 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
560 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
561 return false;
562 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
563 // If only one base op is empty, they do not have the same base ptr
564 return false;
565 }
566
567 // To avoid register pressure, the number of DWORDs loaded together by all
568 // clustered mem ops should not, on average, exceed 8. This is an
569 // empirical value based on certain observations and performance related
570 // experiments.
571 // The good thing about this heuristic is - it avoids clustering of too many
572 // sub-word loads, and also avoids clustering of wide loads. Below is the
573 // brief summary of how the heuristic behaves for various `LoadSize`.
574 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
575 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
576 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
577 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
578 // (5) LoadSize >= 17: do not cluster
579 const unsigned LoadSize = NumBytes / ClusterSize;
580 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
581 return NumDWORDs <= 8;
582}
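// Worked example of the DWORD budget above (numbers illustrative): clustering
// four 12-byte loads gives LoadSize = 48 / 4 = 12 and NumDWORDs =
// ((12 + 3) / 4) * 4 = 12 > 8, so the cluster is rejected, matching case (3);
// four 4-byte loads give NumDWORDs = 1 * 4 = 4 <= 8 and are allowed,
// matching case (1).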
583
584// FIXME: This behaves strangely. If, for example, you have 32 loads + stores,
585// the first 16 loads will be interleaved with the stores, and the next 16 will
586// be clustered as expected. It should really split them into two batches of 16.
587//
588// Loads are clustered until this returns false, rather than trying to schedule
589// groups of stores. This also means we have to deal with saying different
590// address space loads should be clustered, and ones which might cause bank
591// conflicts.
592//
593// This might be deprecated so it might not be worth that much effort to fix.
594bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
595 int64_t Offset0, int64_t Offset1,
596 unsigned NumLoads) const {
597 assert(Offset1 > Offset0 &&
598 "Second offset should be larger than first offset!");
599 // If we have 16 or fewer loads in a row, and the offsets are within 64
600 // bytes, then schedule together.
601
602 // A cacheline is 64 bytes (for global memory).
603 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
604}
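// For instance, two loads at offsets 0 and 48 (within one 64-byte cacheline)
// with NumLoads <= 16 are scheduled together, while loads at offsets 0 and 128
// are not.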
605
606static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
607 MachineBasicBlock::iterator MI,
608 const DebugLoc &DL, MCRegister DestReg,
609 MCRegister SrcReg, bool KillSrc,
610 const char *Msg = "illegal VGPR to SGPR copy") {
611 MachineFunction *MF = MBB.getParent();
612 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
613 LLVMContext &C = MF->getFunction().getContext();
614 C.diagnose(IllegalCopy);
615
616 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
617 .addReg(SrcReg, getKillRegState(KillSrc));
618}
619
620/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
621/// possible to have a direct copy in these cases on GFX908, so an intermediate
622/// VGPR copy is required.
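/// For example (registers chosen purely for illustration), an s0 -> a0 copy on
/// GFX908 is expanded to roughly:
///   v_mov_b32        vTmp, s0
///   v_accvgpr_write  a0, vTmp
/// where vTmp is the reserved intermediate VGPR or a scavenged temporary.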
623static void indirectCopyToAGPR(const SIInstrInfo &TII,
624 MachineBasicBlock &MBB,
625 MachineBasicBlock::iterator MI,
626 const DebugLoc &DL, MCRegister DestReg,
627 MCRegister SrcReg, bool KillSrc,
628 RegScavenger &RS, bool RegsOverlap,
629 Register ImpDefSuperReg = Register(),
630 Register ImpUseSuperReg = Register()) {
631 assert((TII.getSubtarget().hasMAIInsts() &&
632 !TII.getSubtarget().hasGFX90AInsts()) &&
633 "Expected GFX908 subtarget.");
634
635 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
636 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
637 "Source register of the copy should be either an SGPR or an AGPR.");
638
639 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
640 "Destination register of the copy should be an AGPR.");
641
642 const SIRegisterInfo &RI = TII.getRegisterInfo();
643
644 // First try to find defining accvgpr_write to avoid temporary registers.
645 // In the case of copies of overlapping AGPRs, we conservatively do not
646 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
647 // an accvgpr_write used for this same copy due to implicit-defs
648 if (!RegsOverlap) {
649 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
650 --Def;
651
652 if (!Def->modifiesRegister(SrcReg, &RI))
653 continue;
654
655 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
656 Def->getOperand(0).getReg() != SrcReg)
657 break;
658
659 MachineOperand &DefOp = Def->getOperand(1);
660 assert(DefOp.isReg() || DefOp.isImm());
661
662 if (DefOp.isReg()) {
663 bool SafeToPropagate = true;
664 // Check that register source operand is not clobbered before MI.
665 // Immediate operands are always safe to propagate.
666 for (auto I = Def; I != MI && SafeToPropagate; ++I)
667 if (I->modifiesRegister(DefOp.getReg(), &RI))
668 SafeToPropagate = false;
669
670 if (!SafeToPropagate)
671 break;
672
673 DefOp.setIsKill(false);
674 }
675
676 MachineInstrBuilder Builder =
677 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
678 .add(DefOp);
679 if (ImpDefSuperReg)
680 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
681
682 if (ImpUseSuperReg) {
683 Builder.addReg(ImpUseSuperReg,
684 getKillRegState(KillSrc) | RegState::Implicit);
685 }
686
687 return;
688 }
689 }
690
691 RS.enterBasicBlockEnd(MBB);
692 RS.backward(std::next(MI));
693
694 // Ideally we want to have three registers for a long reg_sequence copy
695 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
696 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
697 *MBB.getParent());
698
699 // Registers in the sequence are allocated contiguously so we can just
700 // use register number to pick one of three round-robin temps.
701 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
702 Register Tmp =
703 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
705 "VGPR used for an intermediate copy should have been reserved.");
706
707 // Only loop through if there are any free registers left. We don't want to
708 // spill.
709 while (RegNo--) {
710 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
711 /* RestoreAfter */ false, 0,
712 /* AllowSpill */ false);
713 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
714 break;
715 Tmp = Tmp2;
716 RS.setRegUsed(Tmp);
717 }
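 // For example (register choice illustrative), a copy into AGPR7 gives
 // RegNo = 7 % 3 = 1, so the loop above tries to scavenge one extra VGPR temp
 // beyond the reserved one; copies landing on AGPR0, AGPR3, AGPR6, ... skip the
 // loop and use only the reserved VGPR.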
718
719 // Insert copy to temporary VGPR.
720 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
721 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
722 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
723 } else {
724 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
725 }
726
727 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
728 .addReg(SrcReg, getKillRegState(KillSrc));
729 if (ImpUseSuperReg) {
730 UseBuilder.addReg(ImpUseSuperReg,
731 getKillRegState(KillSrc) | RegState::Implicit);
732 }
733
734 MachineInstrBuilder DefBuilder
735 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
736 .addReg(Tmp, RegState::Kill);
737
738 if (ImpDefSuperReg)
739 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
740}
741
742static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
743 MachineBasicBlock::iterator I, const DebugLoc &DL,
744 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
745 const TargetRegisterClass *RC, bool Forward) {
746 const SIRegisterInfo &RI = TII.getRegisterInfo();
747 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
750
751 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
752 int16_t SubIdx = BaseIndices[Idx];
753 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
754 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
755 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
756 unsigned Opcode = AMDGPU::S_MOV_B32;
757
758 // Is SGPR aligned? If so try to combine with next.
759 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
760 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
761 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
762 // Can use SGPR64 copy
763 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
764 SubIdx = RI.getSubRegFromChannel(Channel, 2);
765 DestSubReg = RI.getSubReg(DestReg, SubIdx);
766 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
767 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
768 Opcode = AMDGPU::S_MOV_B64;
769 Idx++;
770 }
771
772 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
773 .addReg(SrcSubReg)
774 .addReg(SrcReg, RegState::Implicit);
775
776 if (!FirstMI)
777 FirstMI = LastMI;
778
779 if (!Forward)
780 I--;
781 }
782
783 assert(FirstMI && LastMI);
784 if (!Forward)
785 std::swap(FirstMI, LastMI);
786
787 FirstMI->addOperand(
788 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
789
790 if (KillSrc)
791 LastMI->addRegisterKilled(SrcReg, &RI);
792}
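// Illustrative expansion (registers hypothetical): copying s4_s5_s6_s7 into
// s8_s9_s10_s11 emits two s_mov_b64 instructions because both source and
// destination subregister pairs start even-aligned; an unaligned tuple such as
// one starting at s5 falls back to individual s_mov_b32 copies.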
793
794void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
795 MachineBasicBlock::iterator MI,
796 const DebugLoc &DL, MCRegister DestReg,
797 MCRegister SrcReg, bool KillSrc) const {
798 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
799 unsigned Size = RI.getRegSizeInBits(*RC);
800 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
801 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
802
803 // The rest of copyPhysReg assumes Src and Dst size are the same size.
804 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
805 // we remove Fix16BitCopies and this code block?
806 if (Fix16BitCopies) {
807 if (((Size == 16) != (SrcSize == 16))) {
808 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
810 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
811 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
812 RegToFix = SubReg;
813
814 if (DestReg == SrcReg) {
815 // Identity copy. Insert empty bundle since ExpandPostRA expects an
816 // instruction here.
817 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
818 return;
819 }
820 RC = RI.getPhysRegBaseClass(DestReg);
821 Size = RI.getRegSizeInBits(*RC);
822 SrcRC = RI.getPhysRegBaseClass(SrcReg);
823 SrcSize = RI.getRegSizeInBits(*SrcRC);
824 }
825 }
826
827 if (RC == &AMDGPU::VGPR_32RegClass) {
828 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
829 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
830 AMDGPU::AGPR_32RegClass.contains(SrcReg));
831 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
832 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
833 BuildMI(MBB, MI, DL, get(Opc), DestReg)
834 .addReg(SrcReg, getKillRegState(KillSrc));
835 return;
836 }
837
838 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
839 RC == &AMDGPU::SReg_32RegClass) {
840 if (SrcReg == AMDGPU::SCC) {
841 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
842 .addImm(1)
843 .addImm(0);
844 return;
845 }
846
847 if (DestReg == AMDGPU::VCC_LO) {
848 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
849 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
850 .addReg(SrcReg, getKillRegState(KillSrc));
851 } else {
852 // FIXME: Hack until VReg_1 removed.
853 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
854 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
855 .addImm(0)
856 .addReg(SrcReg, getKillRegState(KillSrc));
857 }
858
859 return;
860 }
861
862 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
863 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
864 return;
865 }
866
867 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
868 .addReg(SrcReg, getKillRegState(KillSrc));
869 return;
870 }
871
872 if (RC == &AMDGPU::SReg_64RegClass) {
873 if (SrcReg == AMDGPU::SCC) {
874 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
875 .addImm(1)
876 .addImm(0);
877 return;
878 }
879
880 if (DestReg == AMDGPU::VCC) {
881 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
882 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
883 .addReg(SrcReg, getKillRegState(KillSrc));
884 } else {
885 // FIXME: Hack until VReg_1 removed.
886 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
887 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
888 .addImm(0)
889 .addReg(SrcReg, getKillRegState(KillSrc));
890 }
891
892 return;
893 }
894
895 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
896 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
897 return;
898 }
899
900 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
901 .addReg(SrcReg, getKillRegState(KillSrc));
902 return;
903 }
904
905 if (DestReg == AMDGPU::SCC) {
906 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
907 // but SelectionDAG emits such copies for i1 sources.
908 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
909 // This copy can only be produced by patterns
910 // with explicit SCC, which are known to be enabled
911 // only for subtargets with S_CMP_LG_U64 present.
913 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
914 .addReg(SrcReg, getKillRegState(KillSrc))
915 .addImm(0);
916 } else {
917 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
918 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
919 .addReg(SrcReg, getKillRegState(KillSrc))
920 .addImm(0);
921 }
922
923 return;
924 }
925
926 if (RC == &AMDGPU::AGPR_32RegClass) {
927 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
928 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
929 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
930 .addReg(SrcReg, getKillRegState(KillSrc));
931 return;
932 }
933
934 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
935 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
936 .addReg(SrcReg, getKillRegState(KillSrc));
937 return;
938 }
939
940 // FIXME: Pass should maintain scavenger to avoid scan through the block on
941 // every AGPR spill.
942 RegScavenger RS;
943 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
944 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
945 return;
946 }
947
948 if (Size == 16) {
949 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
950 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
951 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
952
953 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
954 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
955 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
956 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
957 bool DstLow = !AMDGPU::isHi16Reg(DestReg, RI);
958 bool SrcLow = !AMDGPU::isHi16Reg(SrcReg, RI);
959 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
960 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
961
962 if (IsSGPRDst) {
963 if (!IsSGPRSrc) {
964 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
965 return;
966 }
967
968 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
969 .addReg(NewSrcReg, getKillRegState(KillSrc));
970 return;
971 }
972
973 if (IsAGPRDst || IsAGPRSrc) {
974 if (!DstLow || !SrcLow) {
975 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
976 "Cannot use hi16 subreg with an AGPR!");
977 }
978
979 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
980 return;
981 }
982
983 if (ST.hasTrue16BitInsts()) {
984 if (IsSGPRSrc) {
985 assert(SrcLow);
986 SrcReg = NewSrcReg;
987 }
988 // Use the smaller instruction encoding if possible.
989 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
990 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
991 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
992 .addReg(SrcReg);
993 } else {
994 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
995 .addImm(0) // src0_modifiers
996 .addReg(SrcReg)
997 .addImm(0); // op_sel
998 }
999 return;
1000 }
1001
1002 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1003 if (!DstLow || !SrcLow) {
1004 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1005 "Cannot use hi16 subreg on VI!");
1006 }
1007
1008 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1009 .addReg(NewSrcReg, getKillRegState(KillSrc));
1010 return;
1011 }
1012
1013 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1014 .addImm(0) // src0_modifiers
1015 .addReg(NewSrcReg)
1016 .addImm(0) // clamp
1017 .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1018 : AMDGPU::SDWA::SdwaSel::WORD_1)
1019 .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
1020 .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
1021 : AMDGPU::SDWA::SdwaSel::WORD_1)
1022 .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
1023 // First implicit operand is $exec.
1024 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1025 return;
1026 }
1027
1028 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1029 if (ST.hasMovB64()) {
1030 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1031 .addReg(SrcReg, getKillRegState(KillSrc));
1032 return;
1033 }
1034 if (ST.hasPkMovB32()) {
1035 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1036 .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
1037 .addReg(SrcReg)
1038 .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
1039 .addReg(SrcReg)
1040 .addImm(0) // op_sel_lo
1041 .addImm(0) // op_sel_hi
1042 .addImm(0) // neg_lo
1043 .addImm(0) // neg_hi
1044 .addImm(0) // clamp
1045 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1046 return;
1047 }
1048 }
1049
1050 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1051 if (RI.isSGPRClass(RC)) {
1052 if (!RI.isSGPRClass(SrcRC)) {
1053 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1054 return;
1055 }
1056 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1057 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1058 Forward);
1059 return;
1060 }
1061
1062 unsigned EltSize = 4;
1063 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1064 if (RI.isAGPRClass(RC)) {
1065 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1066 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1067 else if (RI.hasVGPRs(SrcRC) ||
1068 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1069 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1070 else
1071 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1072 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1073 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1074 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1075 (RI.isProperlyAlignedRC(*RC) &&
1076 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1077 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1078 if (ST.hasMovB64()) {
1079 Opcode = AMDGPU::V_MOV_B64_e32;
1080 EltSize = 8;
1081 } else if (ST.hasPkMovB32()) {
1082 Opcode = AMDGPU::V_PK_MOV_B32;
1083 EltSize = 8;
1084 }
1085 }
1086
1087 // For the cases where we need an intermediate instruction/temporary register
1088 // (destination is an AGPR), we need a scavenger.
1089 //
1090 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1091 // whole block for every handled copy.
1092 std::unique_ptr<RegScavenger> RS;
1093 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1094 RS = std::make_unique<RegScavenger>();
1095
1096 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1097
1098 // If there is an overlap, we can't kill the super-register on the last
1099 // instruction, since it will also kill the components made live by this def.
1100 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1101 const bool CanKillSuperReg = KillSrc && !Overlap;
1102
1103 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1104 unsigned SubIdx;
1105 if (Forward)
1106 SubIdx = SubIndices[Idx];
1107 else
1108 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1109 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1110 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1111 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1112
1113 bool IsFirstSubreg = Idx == 0;
1114 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1115
1116 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1117 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1118 Register ImpUseSuper = SrcReg;
1119 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1120 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1121 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1122 MachineInstrBuilder MIB =
1123 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1124 .addImm(SISrcMods::OP_SEL_1) // src0_modifiers
1125 .addReg(SrcSubReg)
1126 .addImm(SISrcMods::OP_SEL_1) // src1_modifiers
1127 .addReg(SrcSubReg)
1128 .addImm(0) // op_sel_lo
1129 .addImm(0) // op_sel_hi
1130 .addImm(0) // neg_lo
1131 .addImm(0) // neg_hi
1132 .addImm(0) // clamp
1133 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1134 if (IsFirstSubreg)
1135 MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
1136 } else {
1137 MachineInstrBuilder Builder =
1138 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1139 if (IsFirstSubreg)
1140 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1141
1142 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1143 }
1144 }
1145}
1146
1147int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1148 int NewOpc;
1149
1150 // Try to map original to commuted opcode
1151 NewOpc = AMDGPU::getCommuteRev(Opcode);
1152 if (NewOpc != -1)
1153 // Check if the commuted (REV) opcode exists on the target.
1154 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1155
1156 // Try to map commuted to original opcode
1157 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1158 if (NewOpc != -1)
1159 // Check if the original (non-REV) opcode exists on the target.
1160 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1161
1162 return Opcode;
1163}
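// For example, commuteOpcode maps a "REV" opcode such as V_SUBREV_F32_e32 back
// to V_SUB_F32_e32 (and vice versa) when the counterpart has a valid MC opcode
// on this subtarget; if the counterpart exists but is unavailable, -1 is
// returned, and opcodes with no REV pairing come back unchanged.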
1164
1165void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1166 MachineBasicBlock::iterator MI,
1167 const DebugLoc &DL, Register DestReg,
1168 int64_t Value) const {
1169 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1170 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1171 if (RegClass == &AMDGPU::SReg_32RegClass ||
1172 RegClass == &AMDGPU::SGPR_32RegClass ||
1173 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1175 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1176 .addImm(Value);
1177 return;
1178 }
1179
1180 if (RegClass == &AMDGPU::SReg_64RegClass ||
1181 RegClass == &AMDGPU::SGPR_64RegClass ||
1182 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1183 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1184 .addImm(Value);
1185 return;
1186 }
1187
1188 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1189 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1190 .addImm(Value);
1191 return;
1192 }
1193 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1194 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1195 .addImm(Value);
1196 return;
1197 }
1198
1199 unsigned EltSize = 4;
1200 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1201 if (RI.isSGPRClass(RegClass)) {
1202 if (RI.getRegSizeInBits(*RegClass) > 32) {
1203 Opcode = AMDGPU::S_MOV_B64;
1204 EltSize = 8;
1205 } else {
1206 Opcode = AMDGPU::S_MOV_B32;
1207 EltSize = 4;
1208 }
1209 }
1210
1211 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1212 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1213 int64_t IdxValue = Idx == 0 ? Value : 0;
1214
1215 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1216 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1217 Builder.addImm(IdxValue);
1218 }
1219}
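// Worked example (register class illustrative): materializing 1 into a 128-bit
// VGPR tuple takes the split path above and emits four V_MOV_B32 writes, with
// the first sub-register receiving 1 and the remaining three receiving 0; a
// 128-bit SGPR tuple uses two S_MOV_B64 writes instead (EltSize = 8).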
1220
1221const TargetRegisterClass *
1222SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1223 return &AMDGPU::VGPR_32RegClass;
1224}
1225
1226void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1227 MachineBasicBlock::iterator I,
1228 const DebugLoc &DL, Register DstReg,
1229 ArrayRef<MachineOperand> Cond,
1230 Register TrueReg,
1231 Register FalseReg) const {
1232 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1233 const TargetRegisterClass *BoolXExecRC =
1234 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1235 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1236 "Not a VGPR32 reg");
1237
1238 if (Cond.size() == 1) {
1239 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1240 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1241 .add(Cond[0]);
1242 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1243 .addImm(0)
1244 .addReg(FalseReg)
1245 .addImm(0)
1246 .addReg(TrueReg)
1247 .addReg(SReg);
1248 } else if (Cond.size() == 2) {
1249 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1250 switch (Cond[0].getImm()) {
1251 case SIInstrInfo::SCC_TRUE: {
1252 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1253 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1254 : AMDGPU::S_CSELECT_B64), SReg)
1255 .addImm(1)
1256 .addImm(0);
1257 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1258 .addImm(0)
1259 .addReg(FalseReg)
1260 .addImm(0)
1261 .addReg(TrueReg)
1262 .addReg(SReg);
1263 break;
1264 }
1265 case SIInstrInfo::SCC_FALSE: {
1266 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1267 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1268 : AMDGPU::S_CSELECT_B64), SReg)
1269 .addImm(0)
1270 .addImm(1);
1271 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1272 .addImm(0)
1273 .addReg(FalseReg)
1274 .addImm(0)
1275 .addReg(TrueReg)
1276 .addReg(SReg);
1277 break;
1278 }
1279 case SIInstrInfo::VCCNZ: {
1280 MachineOperand RegOp = Cond[1];
1281 RegOp.setImplicit(false);
1282 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1283 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1284 .add(RegOp);
1285 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1286 .addImm(0)
1287 .addReg(FalseReg)
1288 .addImm(0)
1289 .addReg(TrueReg)
1290 .addReg(SReg);
1291 break;
1292 }
1293 case SIInstrInfo::VCCZ: {
1294 MachineOperand RegOp = Cond[1];
1295 RegOp.setImplicit(false);
1296 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1297 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1298 .add(RegOp);
1299 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1300 .addImm(0)
1301 .addReg(TrueReg)
1302 .addImm(0)
1303 .addReg(FalseReg)
1304 .addReg(SReg);
1305 break;
1306 }
1307 case SIInstrInfo::EXECNZ: {
1308 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1309 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1310 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1311 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1312 .addImm(0);
1313 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1314 : AMDGPU::S_CSELECT_B64), SReg)
1315 .addImm(1)
1316 .addImm(0);
1317 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1318 .addImm(0)
1319 .addReg(FalseReg)
1320 .addImm(0)
1321 .addReg(TrueReg)
1322 .addReg(SReg);
1323 break;
1324 }
1325 case SIInstrInfo::EXECZ: {
1326 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1327 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1328 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1329 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1330 .addImm(0);
1331 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1332 : AMDGPU::S_CSELECT_B64), SReg)
1333 .addImm(0)
1334 .addImm(1);
1335 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1336 .addImm(0)
1337 .addReg(FalseReg)
1338 .addImm(0)
1339 .addReg(TrueReg)
1340 .addReg(SReg);
1341 llvm_unreachable("Unhandled branch predicate EXECZ");
1342 break;
1343 }
1344 default:
1345 llvm_unreachable("invalid branch predicate");
1346 }
1347 } else {
1348 llvm_unreachable("Can only handle Cond size 1 or 2");
1349 }
1350}
1351
1352Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1353 MachineBasicBlock::iterator I,
1354 const DebugLoc &DL,
1355 Register SrcReg, int Value) const {
1356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1357 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1358 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1359 .addImm(Value)
1360 .addReg(SrcReg);
1361
1362 return Reg;
1363}
1364
1365Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1366 MachineBasicBlock::iterator I,
1367 const DebugLoc &DL,
1368 Register SrcReg, int Value) const {
1369 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1370 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1371 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1372 .addImm(Value)
1373 .addReg(SrcReg);
1374
1375 return Reg;
1376}
1377
1378unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1379
1380 if (RI.isAGPRClass(DstRC))
1381 return AMDGPU::COPY;
1382 if (RI.getRegSizeInBits(*DstRC) == 16) {
1383 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1384 // before RA.
1385 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1386 }
1387 if (RI.getRegSizeInBits(*DstRC) == 32)
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC))
1390 return AMDGPU::S_MOV_B64;
1391 if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC))
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 return AMDGPU::COPY;
1394}
1395
1396const MCInstrDesc &
1397SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1398 bool IsIndirectSrc) const {
1399 if (IsIndirectSrc) {
1400 if (VecSize <= 32) // 4 bytes
1401 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1402 if (VecSize <= 64) // 8 bytes
1403 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1404 if (VecSize <= 96) // 12 bytes
1405 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1406 if (VecSize <= 128) // 16 bytes
1407 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1408 if (VecSize <= 160) // 20 bytes
1409 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1410 if (VecSize <= 256) // 32 bytes
1411 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1412 if (VecSize <= 288) // 36 bytes
1413 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1414 if (VecSize <= 320) // 40 bytes
1415 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1416 if (VecSize <= 352) // 44 bytes
1417 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1418 if (VecSize <= 384) // 48 bytes
1419 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1420 if (VecSize <= 512) // 64 bytes
1421 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1422 if (VecSize <= 1024) // 128 bytes
1423 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1424
1425 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1426 }
1427
1428 if (VecSize <= 32) // 4 bytes
1429 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1430 if (VecSize <= 64) // 8 bytes
1431 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1432 if (VecSize <= 96) // 12 bytes
1433 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1434 if (VecSize <= 128) // 16 bytes
1435 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1436 if (VecSize <= 160) // 20 bytes
1437 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1438 if (VecSize <= 256) // 32 bytes
1439 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1440 if (VecSize <= 288) // 36 bytes
1441 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1442 if (VecSize <= 320) // 40 bytes
1443 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1444 if (VecSize <= 352) // 44 bytes
1445 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1446 if (VecSize <= 384) // 48 bytes
1447 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1448 if (VecSize <= 512) // 64 bytes
1449 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1450 if (VecSize <= 1024) // 128 bytes
1451 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1452
1453 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1454}
1455
1456static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1457 if (VecSize <= 32) // 4 bytes
1458 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1459 if (VecSize <= 64) // 8 bytes
1460 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1461 if (VecSize <= 96) // 12 bytes
1462 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1463 if (VecSize <= 128) // 16 bytes
1464 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1465 if (VecSize <= 160) // 20 bytes
1466 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1467 if (VecSize <= 256) // 32 bytes
1468 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1469 if (VecSize <= 288) // 36 bytes
1470 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1471 if (VecSize <= 320) // 40 bytes
1472 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1473 if (VecSize <= 352) // 44 bytes
1474 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1475 if (VecSize <= 384) // 48 bytes
1476 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1477 if (VecSize <= 512) // 64 bytes
1478 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1479 if (VecSize <= 1024) // 128 bytes
1480 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1481
1482 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1483}
1484
1485static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1486 if (VecSize <= 32) // 4 bytes
1487 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1488 if (VecSize <= 64) // 8 bytes
1489 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1490 if (VecSize <= 96) // 12 bytes
1491 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1492 if (VecSize <= 128) // 16 bytes
1493 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1494 if (VecSize <= 160) // 20 bytes
1495 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1496 if (VecSize <= 256) // 32 bytes
1497 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1498 if (VecSize <= 288) // 36 bytes
1499 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1500 if (VecSize <= 320) // 40 bytes
1501 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1502 if (VecSize <= 352) // 44 bytes
1503 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1504 if (VecSize <= 384) // 48 bytes
1505 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1506 if (VecSize <= 512) // 64 bytes
1507 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1508 if (VecSize <= 1024) // 128 bytes
1509 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1510
1511 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1512}
1513
1514static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1515 if (VecSize <= 64) // 8 bytes
1516 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1517 if (VecSize <= 128) // 16 bytes
1518 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1519 if (VecSize <= 256) // 32 bytes
1520 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1521 if (VecSize <= 512) // 64 bytes
1522 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1523 if (VecSize <= 1024) // 128 bytes
1524 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1525
1526 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1527}
1528
1529const MCInstrDesc &
1530SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1531 bool IsSGPR) const {
1532 if (IsSGPR) {
1533 switch (EltSize) {
1534 case 32:
1535 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1536 case 64:
1537 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1538 default:
1539 llvm_unreachable("invalid reg indexing elt size");
1540 }
1541 }
1542
1543 assert(EltSize == 32 && "invalid reg indexing elt size");
1544 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1545}
1546
1547static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1548 switch (Size) {
1549 case 4:
1550 return AMDGPU::SI_SPILL_S32_SAVE;
1551 case 8:
1552 return AMDGPU::SI_SPILL_S64_SAVE;
1553 case 12:
1554 return AMDGPU::SI_SPILL_S96_SAVE;
1555 case 16:
1556 return AMDGPU::SI_SPILL_S128_SAVE;
1557 case 20:
1558 return AMDGPU::SI_SPILL_S160_SAVE;
1559 case 24:
1560 return AMDGPU::SI_SPILL_S192_SAVE;
1561 case 28:
1562 return AMDGPU::SI_SPILL_S224_SAVE;
1563 case 32:
1564 return AMDGPU::SI_SPILL_S256_SAVE;
1565 case 36:
1566 return AMDGPU::SI_SPILL_S288_SAVE;
1567 case 40:
1568 return AMDGPU::SI_SPILL_S320_SAVE;
1569 case 44:
1570 return AMDGPU::SI_SPILL_S352_SAVE;
1571 case 48:
1572 return AMDGPU::SI_SPILL_S384_SAVE;
1573 case 64:
1574 return AMDGPU::SI_SPILL_S512_SAVE;
1575 case 128:
1576 return AMDGPU::SI_SPILL_S1024_SAVE;
1577 default:
1578 llvm_unreachable("unknown register size");
1579 }
1580}
1581
1582static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1583 switch (Size) {
1584 case 4:
1585 return AMDGPU::SI_SPILL_V32_SAVE;
1586 case 8:
1587 return AMDGPU::SI_SPILL_V64_SAVE;
1588 case 12:
1589 return AMDGPU::SI_SPILL_V96_SAVE;
1590 case 16:
1591 return AMDGPU::SI_SPILL_V128_SAVE;
1592 case 20:
1593 return AMDGPU::SI_SPILL_V160_SAVE;
1594 case 24:
1595 return AMDGPU::SI_SPILL_V192_SAVE;
1596 case 28:
1597 return AMDGPU::SI_SPILL_V224_SAVE;
1598 case 32:
1599 return AMDGPU::SI_SPILL_V256_SAVE;
1600 case 36:
1601 return AMDGPU::SI_SPILL_V288_SAVE;
1602 case 40:
1603 return AMDGPU::SI_SPILL_V320_SAVE;
1604 case 44:
1605 return AMDGPU::SI_SPILL_V352_SAVE;
1606 case 48:
1607 return AMDGPU::SI_SPILL_V384_SAVE;
1608 case 64:
1609 return AMDGPU::SI_SPILL_V512_SAVE;
1610 case 128:
1611 return AMDGPU::SI_SPILL_V1024_SAVE;
1612 default:
1613 llvm_unreachable("unknown register size");
1614 }
1615}
1616
1617static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1618 switch (Size) {
1619 case 4:
1620 return AMDGPU::SI_SPILL_A32_SAVE;
1621 case 8:
1622 return AMDGPU::SI_SPILL_A64_SAVE;
1623 case 12:
1624 return AMDGPU::SI_SPILL_A96_SAVE;
1625 case 16:
1626 return AMDGPU::SI_SPILL_A128_SAVE;
1627 case 20:
1628 return AMDGPU::SI_SPILL_A160_SAVE;
1629 case 24:
1630 return AMDGPU::SI_SPILL_A192_SAVE;
1631 case 28:
1632 return AMDGPU::SI_SPILL_A224_SAVE;
1633 case 32:
1634 return AMDGPU::SI_SPILL_A256_SAVE;
1635 case 36:
1636 return AMDGPU::SI_SPILL_A288_SAVE;
1637 case 40:
1638 return AMDGPU::SI_SPILL_A320_SAVE;
1639 case 44:
1640 return AMDGPU::SI_SPILL_A352_SAVE;
1641 case 48:
1642 return AMDGPU::SI_SPILL_A384_SAVE;
1643 case 64:
1644 return AMDGPU::SI_SPILL_A512_SAVE;
1645 case 128:
1646 return AMDGPU::SI_SPILL_A1024_SAVE;
1647 default:
1648 llvm_unreachable("unknown register size");
1649 }
1650}
1651
1652static unsigned getAVSpillSaveOpcode(unsigned Size) {
1653 switch (Size) {
1654 case 4:
1655 return AMDGPU::SI_SPILL_AV32_SAVE;
1656 case 8:
1657 return AMDGPU::SI_SPILL_AV64_SAVE;
1658 case 12:
1659 return AMDGPU::SI_SPILL_AV96_SAVE;
1660 case 16:
1661 return AMDGPU::SI_SPILL_AV128_SAVE;
1662 case 20:
1663 return AMDGPU::SI_SPILL_AV160_SAVE;
1664 case 24:
1665 return AMDGPU::SI_SPILL_AV192_SAVE;
1666 case 28:
1667 return AMDGPU::SI_SPILL_AV224_SAVE;
1668 case 32:
1669 return AMDGPU::SI_SPILL_AV256_SAVE;
1670 case 36:
1671 return AMDGPU::SI_SPILL_AV288_SAVE;
1672 case 40:
1673 return AMDGPU::SI_SPILL_AV320_SAVE;
1674 case 44:
1675 return AMDGPU::SI_SPILL_AV352_SAVE;
1676 case 48:
1677 return AMDGPU::SI_SPILL_AV384_SAVE;
1678 case 64:
1679 return AMDGPU::SI_SPILL_AV512_SAVE;
1680 case 128:
1681 return AMDGPU::SI_SPILL_AV1024_SAVE;
1682 default:
1683 llvm_unreachable("unknown register size");
1684 }
1685}
1686
1687static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1688 bool IsVectorSuperClass) {
1689 // Currently, only 32-bit WWM register spills are needed.
1690 if (Size != 4)
1691 llvm_unreachable("unknown wwm register spill size");
1692
1693 if (IsVectorSuperClass)
1694 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1695
1696 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1697}
1698
1699static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1700 const TargetRegisterClass *RC,
1701 unsigned Size,
1702 const SIRegisterInfo &TRI,
1703 const SIMachineFunctionInfo &MFI) {
1704 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1705
1706 // Choose the right opcode if spilling a WWM register.
1707 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1708 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1709
1710 if (IsVectorSuperClass)
1711 return getAVSpillSaveOpcode(Size);
1712
1713 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1714 : getVGPRSpillSaveOpcode(Size);
1715}
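// For instance, spilling a 32-bit register flagged as WWM selects
// SI_SPILL_WWM_AV32_SAVE or SI_SPILL_WWM_V32_SAVE depending on whether RC is a
// vector superclass; otherwise a 128-bit AV class maps to SI_SPILL_AV128_SAVE,
// an AGPR class to SI_SPILL_A128_SAVE, and a plain VGPR class to
// SI_SPILL_V128_SAVE.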
1716
1717void SIInstrInfo::storeRegToStackSlot(
1718 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1719 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1720 const TargetRegisterInfo *TRI, Register VReg) const {
1721 MachineFunction *MF = MBB.getParent();
1722 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1723 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1724 const DebugLoc &DL = MBB.findDebugLoc(MI);
1725
1726 MachinePointerInfo PtrInfo
1727 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1728 MachineMemOperand *MMO = MF->getMachineMemOperand(
1729 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1730 FrameInfo.getObjectAlign(FrameIndex));
1731 unsigned SpillSize = TRI->getSpillSize(*RC);
1732
1733 MachineRegisterInfo &MRI = MF->getRegInfo();
1734 if (RI.isSGPRClass(RC)) {
1735 MFI->setHasSpilledSGPRs();
1736 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1737 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1738 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1739
1740 // We are only allowed to create one new instruction when spilling
1741 // registers, so we need to use pseudo instruction for spilling SGPRs.
1742 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1743
1744 // The SGPR spill/restore instructions only work on numbered SGPRs, so we
1745 // need to make sure we are using the correct register class.
1746 if (SrcReg.isVirtual() && SpillSize == 4) {
1747 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1748 }
1749
1750 BuildMI(MBB, MI, DL, OpDesc)
1751 .addReg(SrcReg, getKillRegState(isKill)) // data
1752 .addFrameIndex(FrameIndex) // addr
1753 .addMemOperand(MMO)
1754 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1755
1756 if (RI.spillSGPRToVGPR())
1757 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1758 return;
1759 }
1760
1761 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1762 SpillSize, RI, *MFI);
1763 MFI->setHasSpilledVGPRs();
1764
1765 BuildMI(MBB, MI, DL, get(Opcode))
1766 .addReg(SrcReg, getKillRegState(isKill)) // data
1767 .addFrameIndex(FrameIndex) // addr
1768 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1769 .addImm(0) // offset
1770 .addMemOperand(MMO);
1771}
1772
1773static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1774 switch (Size) {
1775 case 4:
1776 return AMDGPU::SI_SPILL_S32_RESTORE;
1777 case 8:
1778 return AMDGPU::SI_SPILL_S64_RESTORE;
1779 case 12:
1780 return AMDGPU::SI_SPILL_S96_RESTORE;
1781 case 16:
1782 return AMDGPU::SI_SPILL_S128_RESTORE;
1783 case 20:
1784 return AMDGPU::SI_SPILL_S160_RESTORE;
1785 case 24:
1786 return AMDGPU::SI_SPILL_S192_RESTORE;
1787 case 28:
1788 return AMDGPU::SI_SPILL_S224_RESTORE;
1789 case 32:
1790 return AMDGPU::SI_SPILL_S256_RESTORE;
1791 case 36:
1792 return AMDGPU::SI_SPILL_S288_RESTORE;
1793 case 40:
1794 return AMDGPU::SI_SPILL_S320_RESTORE;
1795 case 44:
1796 return AMDGPU::SI_SPILL_S352_RESTORE;
1797 case 48:
1798 return AMDGPU::SI_SPILL_S384_RESTORE;
1799 case 64:
1800 return AMDGPU::SI_SPILL_S512_RESTORE;
1801 case 128:
1802 return AMDGPU::SI_SPILL_S1024_RESTORE;
1803 default:
1804 llvm_unreachable("unknown register size");
1805 }
1806}
1807
1808static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1809 switch (Size) {
1810 case 4:
1811 return AMDGPU::SI_SPILL_V32_RESTORE;
1812 case 8:
1813 return AMDGPU::SI_SPILL_V64_RESTORE;
1814 case 12:
1815 return AMDGPU::SI_SPILL_V96_RESTORE;
1816 case 16:
1817 return AMDGPU::SI_SPILL_V128_RESTORE;
1818 case 20:
1819 return AMDGPU::SI_SPILL_V160_RESTORE;
1820 case 24:
1821 return AMDGPU::SI_SPILL_V192_RESTORE;
1822 case 28:
1823 return AMDGPU::SI_SPILL_V224_RESTORE;
1824 case 32:
1825 return AMDGPU::SI_SPILL_V256_RESTORE;
1826 case 36:
1827 return AMDGPU::SI_SPILL_V288_RESTORE;
1828 case 40:
1829 return AMDGPU::SI_SPILL_V320_RESTORE;
1830 case 44:
1831 return AMDGPU::SI_SPILL_V352_RESTORE;
1832 case 48:
1833 return AMDGPU::SI_SPILL_V384_RESTORE;
1834 case 64:
1835 return AMDGPU::SI_SPILL_V512_RESTORE;
1836 case 128:
1837 return AMDGPU::SI_SPILL_V1024_RESTORE;
1838 default:
1839 llvm_unreachable("unknown register size");
1840 }
1841}
1842
1843static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1844 switch (Size) {
1845 case 4:
1846 return AMDGPU::SI_SPILL_A32_RESTORE;
1847 case 8:
1848 return AMDGPU::SI_SPILL_A64_RESTORE;
1849 case 12:
1850 return AMDGPU::SI_SPILL_A96_RESTORE;
1851 case 16:
1852 return AMDGPU::SI_SPILL_A128_RESTORE;
1853 case 20:
1854 return AMDGPU::SI_SPILL_A160_RESTORE;
1855 case 24:
1856 return AMDGPU::SI_SPILL_A192_RESTORE;
1857 case 28:
1858 return AMDGPU::SI_SPILL_A224_RESTORE;
1859 case 32:
1860 return AMDGPU::SI_SPILL_A256_RESTORE;
1861 case 36:
1862 return AMDGPU::SI_SPILL_A288_RESTORE;
1863 case 40:
1864 return AMDGPU::SI_SPILL_A320_RESTORE;
1865 case 44:
1866 return AMDGPU::SI_SPILL_A352_RESTORE;
1867 case 48:
1868 return AMDGPU::SI_SPILL_A384_RESTORE;
1869 case 64:
1870 return AMDGPU::SI_SPILL_A512_RESTORE;
1871 case 128:
1872 return AMDGPU::SI_SPILL_A1024_RESTORE;
1873 default:
1874 llvm_unreachable("unknown register size");
1875 }
1876}
1877
1878static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1879 switch (Size) {
1880 case 4:
1881 return AMDGPU::SI_SPILL_AV32_RESTORE;
1882 case 8:
1883 return AMDGPU::SI_SPILL_AV64_RESTORE;
1884 case 12:
1885 return AMDGPU::SI_SPILL_AV96_RESTORE;
1886 case 16:
1887 return AMDGPU::SI_SPILL_AV128_RESTORE;
1888 case 20:
1889 return AMDGPU::SI_SPILL_AV160_RESTORE;
1890 case 24:
1891 return AMDGPU::SI_SPILL_AV192_RESTORE;
1892 case 28:
1893 return AMDGPU::SI_SPILL_AV224_RESTORE;
1894 case 32:
1895 return AMDGPU::SI_SPILL_AV256_RESTORE;
1896 case 36:
1897 return AMDGPU::SI_SPILL_AV288_RESTORE;
1898 case 40:
1899 return AMDGPU::SI_SPILL_AV320_RESTORE;
1900 case 44:
1901 return AMDGPU::SI_SPILL_AV352_RESTORE;
1902 case 48:
1903 return AMDGPU::SI_SPILL_AV384_RESTORE;
1904 case 64:
1905 return AMDGPU::SI_SPILL_AV512_RESTORE;
1906 case 128:
1907 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1908 default:
1909 llvm_unreachable("unknown register size");
1910 }
1911}
1912
1913static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1914 bool IsVectorSuperClass) {
1915 // Currently, only 32-bit WWM register spills are needed.
1916 if (Size != 4)
1917 llvm_unreachable("unknown wwm register spill size");
1918
1919 if (IsVectorSuperClass)
1920 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1921
1922 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1923}
1924
1925static unsigned
1926getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1927 unsigned Size, const SIRegisterInfo &TRI,
1928 const SIMachineFunctionInfo &MFI) {
1929 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1930
1931 // Choose the right opcode if restoring a WWM register.
1932 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1933 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1934
1935 if (IsVectorSuperClass)
1936 return getAVSpillRestoreOpcode(Size);
1937
1938 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1939 : getVGPRSpillRestoreOpcode(Size);
1940}
1941
1942void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1943 MachineBasicBlock::iterator MI,
1944 Register DestReg, int FrameIndex,
1945 const TargetRegisterClass *RC,
1946 const TargetRegisterInfo *TRI,
1947 Register VReg) const {
1948 MachineFunction *MF = MBB.getParent();
1949 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1950 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1951 const DebugLoc &DL = MBB.findDebugLoc(MI);
1952 unsigned SpillSize = TRI->getSpillSize(*RC);
1953
1954 MachinePointerInfo PtrInfo
1955 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1956
1957 MachineMemOperand *MMO = MF->getMachineMemOperand(
1958 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1959 FrameInfo.getObjectAlign(FrameIndex));
1960
1961 if (RI.isSGPRClass(RC)) {
1962 MFI->setHasSpilledSGPRs();
1963 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1964 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1965 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1966
1967 // FIXME: Maybe this should not include a memoperand because it will be
1968 // lowered to non-memory instructions.
1969 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1970 if (DestReg.isVirtual() && SpillSize == 4) {
1971 MachineRegisterInfo &MRI = MF->getRegInfo();
1972 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1973 }
1974
1975 if (RI.spillSGPRToVGPR())
1976 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1977 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1978 .addFrameIndex(FrameIndex) // addr
1979 .addMemOperand(MMO)
1980 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1981
1982 return;
1983 }
1984
1985 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1986 SpillSize, RI, *MFI);
1987 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1988 .addFrameIndex(FrameIndex) // vaddr
1989 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1990 .addImm(0) // offset
1991 .addMemOperand(MMO);
1992}
1993
1994void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1995 MachineBasicBlock::iterator MI) const {
1996 insertNoops(MBB, MI, 1);
1997}
1998
1999void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2000 MachineBasicBlock::iterator MI,
2001 unsigned Quantity) const {
2002 DebugLoc DL;
2003 while (Quantity > 0) {
2004 unsigned Arg = std::min(Quantity, 8u);
2005 Quantity -= Arg;
2006 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2007 }
2008}
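// Example: insertNoops(MBB, MI, 10) emits "s_nop 7" followed by "s_nop 1",
// since each S_NOP encodes (count - 1) in its immediate and covers at most
// eight wait states per instruction.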
2009
2010void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
2011 auto MF = MBB.getParent();
2012 auto Info = MF->getInfo<SIMachineFunctionInfo>();
2013
2014 assert(Info->isEntryFunction());
2015
2016 if (MBB.succ_empty()) {
2017 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2018 if (HasNoTerminator) {
2019 if (Info->returnsVoid()) {
2020 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2021 } else {
2022 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2023 }
2024 }
2025 }
2026}
2027
2028MachineBasicBlock *SIInstrInfo::insertSimulatedTrap(MachineRegisterInfo &MRI,
2029 MachineBasicBlock &MBB,
2030 MachineInstr &MI,
2031 const DebugLoc &DL) const {
2032 MachineFunction *MF = MBB.getParent();
2033 constexpr unsigned DoorbellIDMask = 0x3ff;
2034 constexpr unsigned ECQueueWaveAbort = 0x400;
2035
2036 MachineBasicBlock *TrapBB = &MBB;
2037 MachineBasicBlock *ContBB = &MBB;
2038 MachineBasicBlock *HaltLoopBB = MF->CreateMachineBasicBlock();
2039
2040 if (!MBB.succ_empty() || std::next(MI.getIterator()) != MBB.end()) {
2041 ContBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2042 TrapBB = MF->CreateMachineBasicBlock();
2043 BuildMI(MBB, MI, DL, get(AMDGPU::S_CBRANCH_EXECNZ)).addMBB(TrapBB);
2044 MF->push_back(TrapBB);
2045 MBB.addSuccessor(TrapBB);
2046 }
2047
2048 // Start with an `s_trap 2`; if we're in PRIV=1 and need the workaround,
2049 // this will be a nop.
2050 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_TRAP))
2051 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2052 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2053 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG_RTN_B32),
2054 DoorbellReg)
2055 .addImm(AMDGPU::SendMsg::ID_RTN_GET_DOORBELL);
2056 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2057 .addUse(AMDGPU::M0);
2058 Register DoorbellRegMasked =
2059 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2060 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2061 .addUse(DoorbellReg)
2062 .addImm(DoorbellIDMask);
2063 Register SetWaveAbortBit =
2064 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2065 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2066 .addUse(DoorbellRegMasked)
2067 .addImm(ECQueueWaveAbort);
2068 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2069 .addUse(SetWaveAbortBit);
2070 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_SENDMSG))
2071 .addImm(AMDGPU::SendMsg::ID_INTERRUPT);
2072 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2073 .addUse(AMDGPU::TTMP2);
2074 BuildMI(*TrapBB, TrapBB->end(), DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoopBB);
2075 TrapBB->addSuccessor(HaltLoopBB);
2076
2077 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2078 BuildMI(*HaltLoopBB, HaltLoopBB->end(), DL, get(AMDGPU::S_BRANCH))
2079 .addMBB(HaltLoopBB);
2080 MF->push_back(HaltLoopBB);
2081 HaltLoopBB->addSuccessor(HaltLoopBB);
2082
2083 return ContBB;
2084}
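// The emitted control flow is roughly:
//   <current block> --(exec != 0)--> TrapBB --> HaltLoopBB (self-loop)
//            \---------------------------------> ContBB (rest of the block)
// TrapBB raises s_trap, masks the doorbell ID out of the sendmsg return value,
// ORs in the queue-wave-abort bit and sends the message (preserving M0 via
// TTMP2), then parks the wave in HaltLoopBB, which keeps issuing s_sethalt 5
// and branching back to itself.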
2085
2086unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
2087 switch (MI.getOpcode()) {
2088 default:
2089 if (MI.isMetaInstruction())
2090 return 0;
2091 return 1; // FIXME: Do wait states equal cycles?
2092
2093 case AMDGPU::S_NOP:
2094 return MI.getOperand(0).getImm() + 1;
2095 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2096 // hazard, even if one exists, won't really be visible. Should we handle it?
2097 }
2098}
2099
2100bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2101 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2102 MachineBasicBlock &MBB = *MI.getParent();
2103 const DebugLoc &DL = MBB.findDebugLoc(MI);
2104 switch (MI.getOpcode()) {
2105 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2106 case AMDGPU::S_MOV_B64_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(AMDGPU::S_MOV_B64));
2110 break;
2111
2112 case AMDGPU::S_MOV_B32_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_MOV_B32));
2116 break;
2117
2118 case AMDGPU::S_XOR_B64_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_XOR_B64));
2122 break;
2123
2124 case AMDGPU::S_XOR_B32_term:
2125 // This is only a terminator to get the correct spill code placement during
2126 // register allocation.
2127 MI.setDesc(get(AMDGPU::S_XOR_B32));
2128 break;
2129 case AMDGPU::S_OR_B64_term:
2130 // This is only a terminator to get the correct spill code placement during
2131 // register allocation.
2132 MI.setDesc(get(AMDGPU::S_OR_B64));
2133 break;
2134 case AMDGPU::S_OR_B32_term:
2135 // This is only a terminator to get the correct spill code placement during
2136 // register allocation.
2137 MI.setDesc(get(AMDGPU::S_OR_B32));
2138 break;
2139
2140 case AMDGPU::S_ANDN2_B64_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2144 break;
2145
2146 case AMDGPU::S_ANDN2_B32_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2150 break;
2151
2152 case AMDGPU::S_AND_B64_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_AND_B64));
2156 break;
2157
2158 case AMDGPU::S_AND_B32_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_B32));
2162 break;
2163
2164 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2168 break;
2169
2170 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2171 // This is only a terminator to get the correct spill code placement during
2172 // register allocation.
2173 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2174 break;
2175
2176 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2177 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2178 break;
2179
2180 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2181 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2182 break;
2183
2184 case AMDGPU::V_MOV_B64_PSEUDO: {
2185 Register Dst = MI.getOperand(0).getReg();
2186 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2187 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2188
2189 const MachineOperand &SrcOp = MI.getOperand(1);
2190 // FIXME: Will this work for 64-bit floating point immediates?
2191 assert(!SrcOp.isFPImm());
2192 if (ST.hasMovB64()) {
2193 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2194 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2195 isUInt<32>(SrcOp.getImm()))
2196 break;
2197 }
2198 if (SrcOp.isImm()) {
2199 APInt Imm(64, SrcOp.getImm());
2200 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2201 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2202 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2203 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2204 .addImm(SISrcMods::OP_SEL_1)
2205 .addImm(Lo.getSExtValue())
2206 .addImm(SISrcMods::OP_SEL_1)
2207 .addImm(Lo.getSExtValue())
2208 .addImm(0) // op_sel_lo
2209 .addImm(0) // op_sel_hi
2210 .addImm(0) // neg_lo
2211 .addImm(0) // neg_hi
2212 .addImm(0); // clamp
2213 } else {
2214 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2215 .addImm(Lo.getSExtValue())
2216 .addReg(Dst, RegState::Implicit | RegState::Define);
2217 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2218 .addImm(Hi.getSExtValue())
2219 .addReg(Dst, RegState::Implicit | RegState::Define);
2220 }
2221 } else {
2222 assert(SrcOp.isReg());
2223 if (ST.hasPkMovB32() &&
2224 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2225 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2226 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2227 .addReg(SrcOp.getReg())
2228 .addImm(SISrcMods::OP_SEL_1) // src1_mod
2229 .addReg(SrcOp.getReg())
2230 .addImm(0) // op_sel_lo
2231 .addImm(0) // op_sel_hi
2232 .addImm(0) // neg_lo
2233 .addImm(0) // neg_hi
2234 .addImm(0); // clamp
2235 } else {
2236 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2237 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2238 .addReg(Dst, RegState::Implicit | RegState::Define);
2239 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2240 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2241 .addReg(Dst, RegState::Implicit | RegState::Define);
2242 }
2243 }
2244 MI.eraseFromParent();
2245 break;
2246 }
2247 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2248 expandMovDPP64(MI);
2249 break;
2250 }
2251 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2252 const MachineOperand &SrcOp = MI.getOperand(1);
2253 assert(!SrcOp.isFPImm());
2254 APInt Imm(64, SrcOp.getImm());
2255 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2256 MI.setDesc(get(AMDGPU::S_MOV_B64));
2257 break;
2258 }
2259
2260 Register Dst = MI.getOperand(0).getReg();
2261 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2262 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2263
2264 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2265 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2266 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2267 .addImm(Lo.getSExtValue())
2268 .addReg(Dst, RegState::Implicit | RegState::Define);
2269 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2270 .addImm(Hi.getSExtValue())
2271 .addReg(Dst, RegState::Implicit | RegState::Define);
2272 MI.eraseFromParent();
2273 break;
2274 }
2275 case AMDGPU::V_SET_INACTIVE_B32: {
2276 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2277 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2278 // FIXME: We may possibly optimize the COPY once we find ways to make LLVM
2279 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
2280 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2281 .add(MI.getOperand(1));
2282 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2283 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2284 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2285 .add(MI.getOperand(2));
2286 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2287 .addReg(Exec);
2288 MI.eraseFromParent();
2289 break;
2290 }
2291 case AMDGPU::V_SET_INACTIVE_B64: {
2292 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2293 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2294 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2295 MI.getOperand(0).getReg())
2296 .add(MI.getOperand(1));
2297 expandPostRAPseudo(*Copy);
2298 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2299 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2300 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2301 MI.getOperand(0).getReg())
2302 .add(MI.getOperand(2));
2303 expandPostRAPseudo(*Copy);
2304 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2305 .addReg(Exec);
2306 MI.eraseFromParent();
2307 break;
2308 }
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2315 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2316 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2317 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2318 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2319 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2320 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2332 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2333 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2334 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2335 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2336 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2337 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2338 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2339
2340 unsigned Opc;
2341 if (RI.hasVGPRs(EltRC)) {
2342 Opc = AMDGPU::V_MOVRELD_B32_e32;
2343 } else {
2344 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2345 : AMDGPU::S_MOVRELD_B32;
2346 }
2347
2348 const MCInstrDesc &OpDesc = get(Opc);
2349 Register VecReg = MI.getOperand(0).getReg();
2350 bool IsUndef = MI.getOperand(1).isUndef();
2351 unsigned SubReg = MI.getOperand(3).getImm();
2352 assert(VecReg == MI.getOperand(1).getReg());
2353
2354 MachineInstrBuilder MIB =
2355 BuildMI(MBB, MI, DL, OpDesc)
2356 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2357 .add(MI.getOperand(2))
2358 .addReg(VecReg, RegState::ImplicitDefine)
2359 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2360
2361 const int ImpDefIdx =
2362 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2363 const int ImpUseIdx = ImpDefIdx + 1;
2364 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2365 MI.eraseFromParent();
2366 break;
2367 }
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2374 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2375 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2376 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2377 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2378 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2379 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2380 assert(ST.useVGPRIndexMode());
2381 Register VecReg = MI.getOperand(0).getReg();
2382 bool IsUndef = MI.getOperand(1).isUndef();
2383 Register Idx = MI.getOperand(3).getReg();
2384 Register SubReg = MI.getOperand(4).getImm();
2385
2386 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2387 .addReg(Idx)
2388 .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
2389 SetOn->getOperand(3).setIsUndef();
2390
2391 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2392 MachineInstrBuilder MIB =
2393 BuildMI(MBB, MI, DL, OpDesc)
2394 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2395 .add(MI.getOperand(2))
2396 .addReg(VecReg, RegState::ImplicitDefine)
2397 .addReg(VecReg,
2398 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2399
2400 const int ImpDefIdx =
2401 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2402 const int ImpUseIdx = ImpDefIdx + 1;
2403 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2404
2405 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2406
2407 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2408
2409 MI.eraseFromParent();
2410 break;
2411 }
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2418 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2419 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2420 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2421 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2422 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2423 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2424 assert(ST.useVGPRIndexMode());
2425 Register Dst = MI.getOperand(0).getReg();
2426 Register VecReg = MI.getOperand(1).getReg();
2427 bool IsUndef = MI.getOperand(1).isUndef();
2428 Register Idx = MI.getOperand(2).getReg();
2429 Register SubReg = MI.getOperand(3).getImm();
2430
2431 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2432 .addReg(Idx)
2433 .addImm(AMDGPU::VGPRIndexMode::SRC0_ENABLE);
2434 SetOn->getOperand(3).setIsUndef();
2435
2436 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2437 .addDef(Dst)
2438 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2439 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2440
2441 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2442
2443 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2444
2445 MI.eraseFromParent();
2446 break;
2447 }
2448 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2449 MachineFunction &MF = *MBB.getParent();
2450 Register Reg = MI.getOperand(0).getReg();
2451 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2452 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2453 MachineOperand OpLo = MI.getOperand(1);
2454 MachineOperand OpHi = MI.getOperand(2);
2455
2456 // Create a bundle so these instructions won't be re-ordered by the
2457 // post-RA scheduler.
2458 MIBundleBuilder Bundler(MBB, MI);
2459 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2460
2461 // What we want here is an offset from the value returned by s_getpc (which
2462 // is the address of the s_add_u32 instruction) to the global variable, but
2463 // since the encoding of $symbol starts 4 bytes after the start of the
2464 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2465 // small. This requires us to add 4 to the global variable offset in order
2466 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2467 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2468 // instruction.
2469
2470 int64_t Adjust = 0;
2471 if (ST.hasGetPCZeroExtension()) {
2472 // Fix up hardware that does not sign-extend the 48-bit PC value by
2473 // inserting: s_sext_i32_i16 reghi, reghi
2474 Bundler.append(
2475 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2476 Adjust += 4;
2477 }
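// With the extra s_sext_i32_i16 in the bundle, the s_add_u32/s_addc_u32 pair
// sits 4 bytes further from the value s_getpc_b64 returns, so the fixups
// below grow to 4 + 4 = 8 bytes for the low half and 4 + 12 = 16 bytes for
// the high half.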
2478
2479 if (OpLo.isGlobal())
2480 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2481 Bundler.append(
2482 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2483
2484 if (OpHi.isGlobal())
2485 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2486 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2487 .addReg(RegHi)
2488 .add(OpHi));
2489
2490 finalizeBundle(MBB, Bundler.begin());
2491
2492 MI.eraseFromParent();
2493 break;
2494 }
2495 case AMDGPU::ENTER_STRICT_WWM: {
2496 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2497 // Whole Wave Mode is entered.
2498 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2499 : AMDGPU::S_OR_SAVEEXEC_B64));
2500 break;
2501 }
2502 case AMDGPU::ENTER_STRICT_WQM: {
2503 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2504 // STRICT_WQM is entered.
2505 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2506 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2507 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2508 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2509 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2510
2511 MI.eraseFromParent();
2512 break;
2513 }
2514 case AMDGPU::EXIT_STRICT_WWM:
2515 case AMDGPU::EXIT_STRICT_WQM: {
2516 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2517 // WWM/STRICT_WQM is exited.
2518 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2519 break;
2520 }
2521 case AMDGPU::SI_RETURN: {
2522 const MachineFunction *MF = MBB.getParent();
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2524 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2525 // Hiding the return address use with SI_RETURN may lead to extra kills in
2526 // the function and missing live-ins. We are fine in practice because callee
2527 // saved register handling ensures the register value is restored before
2528 // RET, but we need the undef flag here to appease the MachineVerifier
2529 // liveness checks.
2530 MachineInstrBuilder MIB =
2531 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2532 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2533
2534 MIB.copyImplicitOps(MI);
2535 MI.eraseFromParent();
2536 break;
2537 }
2538
2539 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2540 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2541 MI.setDesc(get(AMDGPU::S_MUL_U64));
2542 break;
2543
2544 case AMDGPU::S_GETPC_B64_pseudo:
2545 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2546 if (ST.hasGetPCZeroExtension()) {
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2550 // inserting: s_sext_i32_i16 dsthi, dsthi
2551 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2552 DstHi)
2553 .addReg(DstHi);
2554 }
2555 break;
2556 }
2557 return true;
2558}
2559
2560void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
2561 MachineBasicBlock::iterator I, Register DestReg,
2562 unsigned SubIdx, const MachineInstr &Orig,
2563 const TargetRegisterInfo &RI) const {
2564
2565 // Try shrinking the instruction to remat only the part needed for current
2566 // context.
2567 // TODO: Handle more cases.
2568 unsigned Opcode = Orig.getOpcode();
2569 switch (Opcode) {
2570 case AMDGPU::S_LOAD_DWORDX16_IMM:
2571 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2572 if (SubIdx != 0)
2573 break;
2574
2575 if (I == MBB.end())
2576 break;
2577
2578 if (I->isBundled())
2579 break;
2580
2581 // Look for a single use of the register that is also a subreg.
2582 Register RegToFind = Orig.getOperand(0).getReg();
2583 MachineOperand *UseMO = nullptr;
2584 for (auto &CandMO : I->operands()) {
2585 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2586 continue;
2587 if (UseMO) {
2588 UseMO = nullptr;
2589 break;
2590 }
2591 UseMO = &CandMO;
2592 }
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2594 break;
2595
2596 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2598
2599 MachineFunction *MF = MBB.getParent();
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2601 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2602
2603 unsigned NewOpcode = -1;
2604 if (SubregSize == 256)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2606 else if (SubregSize == 128)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2608 else
2609 break;
2610
2611 const MCInstrDesc &TID = get(NewOpcode);
2612 const TargetRegisterClass *NewRC =
2613 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2614 MRI.setRegClass(DestReg, NewRC);
2615
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2618
2619 // Use a smaller load with the desired size, possibly with updated offset.
2620 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(0).setReg(DestReg);
2623 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2624 if (Offset) {
2625 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2628 }
2629 SmallVector<MachineMemOperand *> NewMMOs;
2630 for (const MachineMemOperand *MemOp : Orig.memoperands())
2631 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2632 SubregSize / 8));
2633 MI->setMemRefs(*MF, NewMMOs);
2634
2635 MBB.insert(I, MI);
2636 return;
2637 }
2638
2639 default:
2640 break;
2641 }
2642
2643 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2644}
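// Worked example for the shrink above: if the only user reads the
// sub8_sub9_sub10_sub11 quarter of an S_LOAD_DWORDX16_IMM result, the clone
// becomes an S_LOAD_DWORDX4_IMM, the use drops its subregister index, and the
// immediate offset is bumped by 256 bits / 8 = 32 bytes.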
2645
2646std::pair<MachineInstr*, MachineInstr*>
2647SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
2648 assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2649
2650 if (ST.hasMovB64() &&
2652 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2653 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2654 return std::pair(&MI, nullptr);
2655 }
2656
2657 MachineBasicBlock &MBB = *MI.getParent();
2658 DebugLoc DL = MBB.findDebugLoc(MI);
2659 MachineFunction *MF = MBB.getParent();
2660 MachineRegisterInfo &MRI = MF->getRegInfo();
2661 Register Dst = MI.getOperand(0).getReg();
2662 unsigned Part = 0;
2663 MachineInstr *Split[2];
2664
2665 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2666 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2667 if (Dst.isPhysical()) {
2668 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2669 } else {
2670 assert(MRI.isSSA());
2671 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2672 MovDPP.addDef(Tmp);
2673 }
2674
2675 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2676 const MachineOperand &SrcOp = MI.getOperand(I);
2677 assert(!SrcOp.isFPImm());
2678 if (SrcOp.isImm()) {
2679 APInt Imm(64, SrcOp.getImm());
2680 Imm.ashrInPlace(Part * 32);
2681 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2682 } else {
2683 assert(SrcOp.isReg());
2684 Register Src = SrcOp.getReg();
2685 if (Src.isPhysical())
2686 MovDPP.addReg(RI.getSubReg(Src, Sub));
2687 else
2688 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2689 }
2690 }
2691
2692 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2693 MovDPP.addImm(MO.getImm());
2694
2695 Split[Part] = MovDPP;
2696 ++Part;
2697 }
2698
2699 if (Dst.isVirtual())
2700 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2701 .addReg(Split[0]->getOperand(0).getReg())
2702 .addImm(AMDGPU::sub0)
2703 .addReg(Split[1]->getOperand(0).getReg())
2704 .addImm(AMDGPU::sub1);
2705
2706 MI.eraseFromParent();
2707 return std::pair(Split[0], Split[1]);
2708}
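// e.g. when the pseudo cannot be turned into a native V_MOV_B64_dpp, it is
// rewritten as two V_MOV_B32_dpp over sub0/sub1, and (for virtual
// destinations) a REG_SEQUENCE that recombines the two halves.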
2709
2710std::optional<DestSourcePair>
2711SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
2712 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2713 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2714
2715 return std::nullopt;
2716}
2717
2718bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
2719 MachineOperand &Src0,
2720 unsigned Src0OpName,
2721 MachineOperand &Src1,
2722 unsigned Src1OpName) const {
2723 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2724 if (!Src0Mods)
2725 return false;
2726
2727 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2728 assert(Src1Mods &&
2729 "All commutable instructions have both src0 and src1 modifiers");
2730
2731 int Src0ModsVal = Src0Mods->getImm();
2732 int Src1ModsVal = Src1Mods->getImm();
2733
2734 Src1Mods->setImm(Src0ModsVal);
2735 Src0Mods->setImm(Src1ModsVal);
2736 return true;
2737}
2738
2739static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
2740 MachineOperand &RegOp,
2741 MachineOperand &NonRegOp) {
2742 Register Reg = RegOp.getReg();
2743 unsigned SubReg = RegOp.getSubReg();
2744 bool IsKill = RegOp.isKill();
2745 bool IsDead = RegOp.isDead();
2746 bool IsUndef = RegOp.isUndef();
2747 bool IsDebug = RegOp.isDebug();
2748
2749 if (NonRegOp.isImm())
2750 RegOp.ChangeToImmediate(NonRegOp.getImm());
2751 else if (NonRegOp.isFI())
2752 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2753 else if (NonRegOp.isGlobal()) {
2754 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2755 NonRegOp.getTargetFlags());
2756 } else
2757 return nullptr;
2758
2759 // Make sure we don't reinterpret a subreg index in the target flags.
2760 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2761
2762 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2763 NonRegOp.setSubReg(SubReg);
2764
2765 return &MI;
2766}
2767
2768MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
2769 unsigned Src0Idx,
2770 unsigned Src1Idx) const {
2771 assert(!NewMI && "this should never be used");
2772
2773 unsigned Opc = MI.getOpcode();
2774 int CommutedOpcode = commuteOpcode(Opc);
2775 if (CommutedOpcode == -1)
2776 return nullptr;
2777
2778 if (Src0Idx > Src1Idx)
2779 std::swap(Src0Idx, Src1Idx);
2780
2781 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2782 static_cast<int>(Src0Idx) &&
2783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2784 static_cast<int>(Src1Idx) &&
2785 "inconsistency with findCommutedOpIndices");
2786
2787 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2788 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2789
2790 MachineInstr *CommutedMI = nullptr;
2791 if (Src0.isReg() && Src1.isReg()) {
2792 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2793 // Be sure to copy the source modifiers to the right place.
2794 CommutedMI
2795 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2796 }
2797
2798 } else if (Src0.isReg() && !Src1.isReg()) {
2799 // src0 should always be able to support any operand type, so no need to
2800 // check operand legality.
2801 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2802 } else if (!Src0.isReg() && Src1.isReg()) {
2803 if (isOperandLegal(MI, Src1Idx, &Src0))
2804 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2805 } else {
2806 // FIXME: Found two non registers to commute. This does happen.
2807 return nullptr;
2808 }
2809
2810 if (CommutedMI) {
2811 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2812 Src1, AMDGPU::OpName::src1_modifiers);
2813
2814 CommutedMI->setDesc(get(CommutedOpcode));
2815 }
2816
2817 return CommutedMI;
2818}
2819
2820// This needs to be implemented because the source modifiers may be inserted
2821// between the true commutable operands, and the base
2822// TargetInstrInfo::commuteInstruction uses it.
2823bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2824 unsigned &SrcOpIdx0,
2825 unsigned &SrcOpIdx1) const {
2826 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2827}
2828
2829bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2830 unsigned &SrcOpIdx0,
2831 unsigned &SrcOpIdx1) const {
2832 if (!Desc.isCommutable())
2833 return false;
2834
2835 unsigned Opc = Desc.getOpcode();
2836 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2837 if (Src0Idx == -1)
2838 return false;
2839
2840 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2841 if (Src1Idx == -1)
2842 return false;
2843
2844 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2845}
2846
2847bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
2848 int64_t BrOffset) const {
2849 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2850 // block is unanalyzable.
2851 assert(BranchOp != AMDGPU::S_SETPC_B64);
2852
2853 // Convert to dwords.
2854 BrOffset /= 4;
2855
2856 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2857 // from the next instruction.
2858 BrOffset -= 1;
2859
2860 return isIntN(BranchOffsetBits, BrOffset);
2861}
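// Example, assuming BrOffset is the byte distance from the branch instruction
// itself: a destination 68 bytes ahead becomes 68 / 4 - 1 = 16 dwords, which
// matches the hardware rule PC += signext(SIMM16 * 4) + 4 with SIMM16 = 16.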
2862
2863MachineBasicBlock *
2864SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
2865 return MI.getOperand(0).getMBB();
2866}
2867
2869 for (const MachineInstr &MI : MBB->terminators()) {
2870 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2871 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2872 MI.getOpcode() == AMDGPU::SI_LOOP)
2873 return true;
2874 }
2875 return false;
2876}
2877
2878void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
2879 MachineBasicBlock &DestBB,
2880 MachineBasicBlock &RestoreBB,
2881 const DebugLoc &DL, int64_t BrOffset,
2882 RegScavenger *RS) const {
2883 assert(RS && "RegScavenger required for long branching");
2884 assert(MBB.empty() &&
2885 "new block should be inserted for expanding unconditional branch");
2886 assert(MBB.pred_size() == 1);
2887 assert(RestoreBB.empty() &&
2888 "restore block should be inserted for restoring clobbered registers");
2889
2893
2894 // FIXME: Virtual register workaround for RegScavenger not working with empty
2895 // blocks.
2896 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2897
2898 auto I = MBB.end();
2899
2900 // We need to compute the offset relative to the instruction immediately after
2901 // s_getpc_b64. Insert pc arithmetic code before last terminator.
2902 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2903
2904 auto &MCCtx = MF->getContext();
2905 MCSymbol *PostGetPCLabel =
2906 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2907 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2908
2909 MCSymbol *OffsetLo =
2910 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2911 MCSymbol *OffsetHi =
2912 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2915 .addReg(PCReg, 0, AMDGPU::sub0)
2916 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2917 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2918 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2919 .addReg(PCReg, 0, AMDGPU::sub1)
2920 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2921
2922 // Insert the indirect branch after the other terminator.
2923 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2924 .addReg(PCReg);
2925
2926 // If a spill is needed for the pc register pair, we need to insert a spill
2927 // restore block right before the destination block, and insert a short branch
2928 // into the old destination block's fallthrough predecessor.
2929 // e.g.:
2930 //
2931 // s_cbranch_scc0 skip_long_branch:
2932 //
2933 // long_branch_bb:
2934 // spill s[8:9]
2935 // s_getpc_b64 s[8:9]
2936 // s_add_u32 s8, s8, restore_bb
2937 // s_addc_u32 s9, s9, 0
2938 // s_setpc_b64 s[8:9]
2939 //
2940 // skip_long_branch:
2941 // foo;
2942 //
2943 // .....
2944 //
2945 // dest_bb_fallthrough_predecessor:
2946 // bar;
2947 // s_branch dest_bb
2948 //
2949 // restore_bb:
2950 // restore s[8:9]
2951 // fallthrough dest_bb
2952 //
2953 // dest_bb:
2954 // buzz;
2955
2956 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2957 Register Scav;
2958
2959 // If we've previously reserved a register for long branches,
2960 // avoid running the scavenger and just use that register.
2961 if (LongBranchReservedReg) {
2962 RS->enterBasicBlock(MBB);
2963 Scav = LongBranchReservedReg;
2964 } else {
2966 Scav = RS->scavengeRegisterBackwards(
2967 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2968 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2969 }
2970 if (Scav) {
2971 RS->setRegUsed(Scav);
2972 MRI.replaceRegWith(PCReg, Scav);
2973 MRI.clearVirtRegs();
2974 } else {
2975 // As SGPR needs VGPR to be spilled, we reuse the slot of temporary VGPR for
2976 // SGPR spill.
2977 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2979 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2980 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2981 MRI.clearVirtRegs();
2982 }
2983
2984 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2985 // Now the distance between the two labels can be defined.
2986 auto *Offset = MCBinaryExpr::createSub(
2987 MCSymbolRefExpr::create(DestLabel, MCCtx),
2988 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2989 // Add offset assignments.
2990 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2991 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2992 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2993 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2994}
2995
2996unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2997 switch (Cond) {
2998 case SIInstrInfo::SCC_TRUE:
2999 return AMDGPU::S_CBRANCH_SCC1;
3000 case SIInstrInfo::SCC_FALSE:
3001 return AMDGPU::S_CBRANCH_SCC0;
3002 case SIInstrInfo::VCCNZ:
3003 return AMDGPU::S_CBRANCH_VCCNZ;
3004 case SIInstrInfo::VCCZ:
3005 return AMDGPU::S_CBRANCH_VCCZ;
3006 case SIInstrInfo::EXECNZ:
3007 return AMDGPU::S_CBRANCH_EXECNZ;
3008 case SIInstrInfo::EXECZ:
3009 return AMDGPU::S_CBRANCH_EXECZ;
3010 default:
3011 llvm_unreachable("invalid branch predicate");
3012 }
3013}
3014
3015SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3016 switch (Opcode) {
3017 case AMDGPU::S_CBRANCH_SCC0:
3018 return SCC_FALSE;
3019 case AMDGPU::S_CBRANCH_SCC1:
3020 return SCC_TRUE;
3021 case AMDGPU::S_CBRANCH_VCCNZ:
3022 return VCCNZ;
3023 case AMDGPU::S_CBRANCH_VCCZ:
3024 return VCCZ;
3025 case AMDGPU::S_CBRANCH_EXECNZ:
3026 return EXECNZ;
3027 case AMDGPU::S_CBRANCH_EXECZ:
3028 return EXECZ;
3029 default:
3030 return INVALID_BR;
3031 }
3032}
3033
3034bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
3035 MachineBasicBlock::iterator I,
3036 MachineBasicBlock *&TBB,
3037 MachineBasicBlock *&FBB,
3038 SmallVectorImpl<MachineOperand> &Cond,
3039 bool AllowModify) const {
3040 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3041 // Unconditional Branch
3042 TBB = I->getOperand(0).getMBB();
3043 return false;
3044 }
3045
3046 MachineBasicBlock *CondBB = nullptr;
3047
3048 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3049 CondBB = I->getOperand(1).getMBB();
3050 Cond.push_back(I->getOperand(0));
3051 } else {
3052 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3053 if (Pred == INVALID_BR)
3054 return true;
3055
3056 CondBB = I->getOperand(0).getMBB();
3057 Cond.push_back(MachineOperand::CreateImm(Pred));
3058 Cond.push_back(I->getOperand(1)); // Save the branch register.
3059 }
3060 ++I;
3061
3062 if (I == MBB.end()) {
3063 // Conditional branch followed by fall-through.
3064 TBB = CondBB;
3065 return false;
3066 }
3067
3068 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3069 TBB = CondBB;
3070 FBB = I->getOperand(0).getMBB();
3071 return false;
3072 }
3073
3074 return true;
3075}
3076
3077bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
3078 MachineBasicBlock *&FBB,
3079 SmallVectorImpl<MachineOperand> &Cond,
3080 bool AllowModify) const {
3081 MachineBasicBlock::iterator I = MBB.getFirstTerminator();
3082 auto E = MBB.end();
3083 if (I == E)
3084 return false;
3085
3086 // Skip over the instructions that are artificially terminators for special
3087 // exec management.
3088 while (I != E && !I->isBranch() && !I->isReturn()) {
3089 switch (I->getOpcode()) {
3090 case AMDGPU::S_MOV_B64_term:
3091 case AMDGPU::S_XOR_B64_term:
3092 case AMDGPU::S_OR_B64_term:
3093 case AMDGPU::S_ANDN2_B64_term:
3094 case AMDGPU::S_AND_B64_term:
3095 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3096 case AMDGPU::S_MOV_B32_term:
3097 case AMDGPU::S_XOR_B32_term:
3098 case AMDGPU::S_OR_B32_term:
3099 case AMDGPU::S_ANDN2_B32_term:
3100 case AMDGPU::S_AND_B32_term:
3101 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3102 break;
3103 case AMDGPU::SI_IF:
3104 case AMDGPU::SI_ELSE:
3105 case AMDGPU::SI_KILL_I1_TERMINATOR:
3106 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3107 // FIXME: It's messy that these need to be considered here at all.
3108 return true;
3109 default:
3110 llvm_unreachable("unexpected non-branch terminator inst");
3111 }
3112
3113 ++I;
3114 }
3115
3116 if (I == E)
3117 return false;
3118
3119 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3120}
3121
3122unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
3123 int *BytesRemoved) const {
3124 unsigned Count = 0;
3125 unsigned RemovedSize = 0;
3126 for (MachineInstr &MI : llvm::make_early_inc_range(MBB.terminators())) {
3127 // Skip over artificial terminators when removing instructions.
3128 if (MI.isBranch() || MI.isReturn()) {
3129 RemovedSize += getInstSizeInBytes(MI);
3130 MI.eraseFromParent();
3131 ++Count;
3132 }
3133 }
3134
3135 if (BytesRemoved)
3136 *BytesRemoved = RemovedSize;
3137
3138 return Count;
3139}
3140
3141// Copy the flags onto the implicit condition register operand.
3142static void preserveCondRegFlags(MachineOperand &CondReg,
3143 const MachineOperand &OrigCond) {
3144 CondReg.setIsUndef(OrigCond.isUndef());
3145 CondReg.setIsKill(OrigCond.isKill());
3146}
3147
3149unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
3150 MachineBasicBlock *FBB,
3151 ArrayRef<MachineOperand> Cond,
3152 const DebugLoc &DL,
3153 int *BytesAdded) const {
3154 if (!FBB && Cond.empty()) {
3155 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3156 .addMBB(TBB);
3157 if (BytesAdded)
3158 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3159 return 1;
3160 }
3161
3162 if (Cond.size() == 1 && Cond[0].isReg()) {
3163 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3164 .add(Cond[0])
3165 .addMBB(TBB);
3166 return 1;
3167 }
3168
3169 assert(TBB && Cond[0].isImm());
3170
3171 unsigned Opcode
3172 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3173
3174 if (!FBB) {
3175 MachineInstr *CondBr =
3176 BuildMI(&MBB, DL, get(Opcode))
3177 .addMBB(TBB);
3178
3179 // Copy the flags onto the implicit condition register operand.
3180 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3181 fixImplicitOperands(*CondBr);
3182
3183 if (BytesAdded)
3184 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3185 return 1;
3186 }
3187
3188 assert(TBB && FBB);
3189
3190 MachineInstr *CondBr =
3191 BuildMI(&MBB, DL, get(Opcode))
3192 .addMBB(TBB);
3193 fixImplicitOperands(*CondBr);
3194 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3195 .addMBB(FBB);
3196
3197 MachineOperand &CondReg = CondBr->getOperand(1);
3198 CondReg.setIsUndef(Cond[1].isUndef());
3199 CondReg.setIsKill(Cond[1].isKill());
3200
3201 if (BytesAdded)
3202 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3203
3204 return 2;
3205}
3206
3207bool SIInstrInfo::reverseBranchCondition(
3208 SmallVectorImpl<MachineOperand> &Cond) const {
3209 if (Cond.size() != 2) {
3210 return true;
3211 }
3212
3213 if (Cond[0].isImm()) {
3214 Cond[0].setImm(-Cond[0].getImm());
3215 return false;
3216 }
3217
3218 return true;
3219}
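// Negating the immediate is enough because the BranchPredicate values are
// laid out as +/- pairs (SCC_TRUE/SCC_FALSE, VCCNZ/VCCZ, EXECNZ/EXECZ); the
// same negation trick is used by insertSelect below to canonicalize
// VCCZ/SCC_FALSE predicates.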
3220
3221bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
3222 ArrayRef<MachineOperand> Cond,
3223 Register DstReg, Register TrueReg,
3224 Register FalseReg, int &CondCycles,
3225 int &TrueCycles, int &FalseCycles) const {
3226 switch (Cond[0].getImm()) {
3227 case VCCNZ:
3228 case VCCZ: {
3230 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3231 if (MRI.getRegClass(FalseReg) != RC)
3232 return false;
3233
3234 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3235 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3236
3237 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3238 return RI.hasVGPRs(RC) && NumInsts <= 6;
3239 }
3240 case SCC_TRUE:
3241 case SCC_FALSE: {
3242 // FIXME: We could insert for VGPRs if we could replace the original compare
3243 // with a vector one.
3245 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3246 if (MRI.getRegClass(FalseReg) != RC)
3247 return false;
3248
3249 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3250
3251 // Multiples of 8 can do s_cselect_b64
3252 if (NumInsts % 2 == 0)
3253 NumInsts /= 2;
3254
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256 return RI.isSGPRClass(RC);
3257 }
3258 default:
3259 return false;
3260 }
3261}
3262
3263void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
3264 MachineBasicBlock::iterator I, const DebugLoc &DL,
3265 Register DstReg, ArrayRef<MachineOperand> Cond,
3266 Register TrueReg, Register FalseReg) const {
3267 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3268 if (Pred == VCCZ || Pred == SCC_FALSE) {
3269 Pred = static_cast<BranchPredicate>(-Pred);
3270 std::swap(TrueReg, FalseReg);
3271 }
3272
3274 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3275 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3276
3277 if (DstSize == 32) {
3278 MachineInstr *Select;
3279 if (Pred == SCC_TRUE) {
3280 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283 } else {
3284 // Instruction's operands are backwards from what is expected.
3285 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3286 .addReg(FalseReg)
3287 .addReg(TrueReg);
3288 }
3289
3290 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3291 return;
3292 }
3293
3294 if (DstSize == 64 && Pred == SCC_TRUE) {
3295 MachineInstr *Select =
3296 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3297 .addReg(TrueReg)
3298 .addReg(FalseReg);
3299
3300 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3301 return;
3302 }
3303
3304 static const int16_t Sub0_15[] = {
3305 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3306 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3307 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3308 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3309 };
3310
3311 static const int16_t Sub0_15_64[] = {
3312 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3313 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3314 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3315 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3316 };
3317
3318 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3319 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3320 const int16_t *SubIndices = Sub0_15;
3321 int NElts = DstSize / 32;
3322
3323 // 64-bit select is only available for SALU.
3324 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3325 if (Pred == SCC_TRUE) {
3326 if (NElts % 2) {
3327 SelOp = AMDGPU::S_CSELECT_B32;
3328 EltRC = &AMDGPU::SGPR_32RegClass;
3329 } else {
3330 SelOp = AMDGPU::S_CSELECT_B64;
3331 EltRC = &AMDGPU::SGPR_64RegClass;
3332 SubIndices = Sub0_15_64;
3333 NElts /= 2;
3334 }
3335 }
3336
3337 MachineInstrBuilder MIB = BuildMI(
3338 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3339
3340 I = MIB->getIterator();
3341
3343 for (int Idx = 0; Idx != NElts; ++Idx) {
3344 Register DstElt = MRI.createVirtualRegister(EltRC);
3345 Regs.push_back(DstElt);
3346
3347 unsigned SubIdx = SubIndices[Idx];
3348
3350 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3351 Select =
3352 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3353 .addReg(FalseReg, 0, SubIdx)
3354 .addReg(TrueReg, 0, SubIdx);
3355 } else {
3356 Select =
3357 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3358 .addReg(TrueReg, 0, SubIdx)
3359 .addReg(FalseReg, 0, SubIdx);
3360 }
3361
3362 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3364
3365 MIB.addReg(DstElt)
3366 .addImm(SubIdx);
3367 }
3368}
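// e.g. a 128-bit VGPR select expands to four V_CNDMASK_B32_e32 on sub0..sub3
// feeding a REG_SEQUENCE, while a 128-bit SGPR select under SCC is done with
// two S_CSELECT_B64 on the 64-bit subregister pairs.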
3369
3370bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
3371 switch (MI.getOpcode()) {
3372 case AMDGPU::V_MOV_B16_t16_e32:
3373 case AMDGPU::V_MOV_B16_t16_e64:
3374 case AMDGPU::V_MOV_B32_e32:
3375 case AMDGPU::V_MOV_B32_e64:
3376 case AMDGPU::V_MOV_B64_PSEUDO:
3377 case AMDGPU::V_MOV_B64_e32:
3378 case AMDGPU::V_MOV_B64_e64:
3379 case AMDGPU::S_MOV_B32:
3380 case AMDGPU::S_MOV_B64:
3381 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3382 case AMDGPU::COPY:
3383 case AMDGPU::WWM_COPY:
3384 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3385 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3386 case AMDGPU::V_ACCVGPR_MOV_B32:
3387 return true;
3388 default:
3389 return false;
3390 }
3391}
3392
3393static constexpr unsigned ModifierOpNames[] = {
3394 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3395 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3396 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3397
3398void SIInstrInfo::removeModOperands(MachineInstr &MI) const {
3399 unsigned Opc = MI.getOpcode();
3400 for (unsigned Name : reverse(ModifierOpNames)) {
3401 int Idx = AMDGPU::getNamedOperandIdx(Opc, Name);
3402 if (Idx >= 0)
3403 MI.removeOperand(Idx);
3404 }
3405}
3406
3407bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
3408 Register Reg, MachineRegisterInfo *MRI) const {
3409 if (!MRI->hasOneNonDBGUse(Reg))
3410 return false;
3411
3412 switch (DefMI.getOpcode()) {
3413 default:
3414 return false;
3415 case AMDGPU::V_MOV_B64_e32:
3416 case AMDGPU::S_MOV_B64:
3417 case AMDGPU::V_MOV_B64_PSEUDO:
3418 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3419 case AMDGPU::V_MOV_B32_e32:
3420 case AMDGPU::S_MOV_B32:
3421 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3422 break;
3423 }
3424
3425 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3426 assert(ImmOp);
3427 // FIXME: We could handle FrameIndex values here.
3428 if (!ImmOp->isImm())
3429 return false;
3430
3431 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3432 int64_t Imm = ImmOp->getImm();
3433 switch (UseOp.getSubReg()) {
3434 default:
3435 return Imm;
3436 case AMDGPU::sub0:
3437 return Lo_32(Imm);
3438 case AMDGPU::sub1:
3439 return Hi_32(Imm);
3440 case AMDGPU::lo16:
3441 return APInt(16, Imm).getSExtValue();
3442 case AMDGPU::hi16:
3443 return APInt(32, Imm).ashr(16).getSExtValue();
3444 case AMDGPU::sub1_lo16:
3445 return APInt(16, Hi_32(Imm)).getSExtValue();
3446 case AMDGPU::sub1_hi16:
3447 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3448 }
3449 };
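// The lambda above picks the slice of the 64-bit immediate that the use
// actually reads: the whole value by default, the low/high dword for
// sub0/sub1, and sign-extended 16-bit halves for the lo16/hi16 (and
// sub1_lo16/sub1_hi16) indices.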
3450
3451 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3452
3453 unsigned Opc = UseMI.getOpcode();
3454 if (Opc == AMDGPU::COPY) {
3455 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3456
3457 Register DstReg = UseMI.getOperand(0).getReg();
3458 unsigned OpSize = getOpSize(UseMI, 0);
3459 bool Is16Bit = OpSize == 2;
3460 bool Is64Bit = OpSize == 8;
3461 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3462 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3463 : AMDGPU::V_MOV_B32_e32
3464 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3465 : AMDGPU::S_MOV_B32;
3466 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3467
3468 if (RI.isAGPR(*MRI, DstReg)) {
3469 if (Is64Bit || !isInlineConstant(Imm))
3470 return false;
3471 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3472 }
3473
3474 if (Is16Bit) {
3475 if (isVGPRCopy)
3476 return false; // Do not clobber vgpr_hi16
3477
3478 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3479 return false;
3480
3481 UseMI.getOperand(0).setSubReg(0);
3482 if (DstReg.isPhysical()) {
3483 DstReg = RI.get32BitRegister(DstReg);
3484 UseMI.getOperand(0).setReg(DstReg);
3485 }
3486 assert(UseMI.getOperand(1).getReg().isVirtual());
3487 }
3488
3489 const MCInstrDesc &NewMCID = get(NewOpc);
3490 if (DstReg.isPhysical() &&
3491 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3492 return false;
3493
3494 UseMI.setDesc(NewMCID);
3495 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3496 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3497 return true;
3498 }
3499
3500 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3501 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3503 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3504 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3505 // Don't fold if we are using source or output modifiers. The new VOP2
3506 // instructions don't have them.
3507 if (hasAnyModifiersSet(UseMI))
3508 return false;
3509
3510 // If this is a free constant, there's no reason to do this.
3511 // TODO: We could fold this here instead of letting SIFoldOperands do it
3512 // later.
3513 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3514
3515 // Any src operand can be used for the legality check.
3516 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3517 return false;
3518
3519 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3520 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3521 bool IsFMA =
3522 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3523 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3524 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3525 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3526 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3527
3528 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3529 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3530 (Src1->isReg() && Src1->getReg() == Reg)) {
3531 MachineOperand *RegSrc =
3532 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3533 if (!RegSrc->isReg())
3534 return false;
3535 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3536 ST.getConstantBusLimit(Opc) < 2)
3537 return false;
3538
3539 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3540 return false;
3541
3542 // If src2 is also a literal constant then we have to choose which one to
3543 // fold. In general it is better to choose madak so that the other literal
3544 // can be materialized in an sgpr instead of a vgpr:
3545 // s_mov_b32 s0, literal
3546 // v_madak_f32 v0, s0, v0, literal
3547 // Instead of:
3548 // v_mov_b32 v1, literal
3549 // v_madmk_f32 v0, v0, literal, v1
3550 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3551 if (Def && Def->isMoveImmediate() &&
3552 !isInlineConstant(Def->getOperand(1)))
3553 return false;
3554
3555 unsigned NewOpc =
3556 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3557 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3558 : AMDGPU::V_FMAMK_F16)
3559 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3560 if (pseudoToMCOpcode(NewOpc) == -1)
3561 return false;
3562
3563 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3564 // would also require restricting their register classes. For now
3565 // just bail out.
3566 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3567 return false;
3568
3569 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3570
3571 // FIXME: This would be a lot easier if we could return a new instruction
3572 // instead of having to modify in place.
3573
3574 Register SrcReg = RegSrc->getReg();
3575 unsigned SrcSubReg = RegSrc->getSubReg();
3576 Src0->setReg(SrcReg);
3577 Src0->setSubReg(SrcSubReg);
3578 Src0->setIsKill(RegSrc->isKill());
3579
3580 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3581 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3582 Opc == AMDGPU::V_FMAC_F16_e64)
3583 UseMI.untieRegOperand(
3584 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3585
3586 Src1->ChangeToImmediate(Imm);
3587
3588 removeModOperands(UseMI);
3589 UseMI.setDesc(get(NewOpc));
3590
3591 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3592 if (DeleteDef)
3593 DefMI.eraseFromParent();
3594
3595 return true;
3596 }
3597
3598 // Added part is the constant: Use v_madak_{f16, f32}.
3599 if (Src2->isReg() && Src2->getReg() == Reg) {
3600 if (ST.getConstantBusLimit(Opc) < 2) {
3601 // Not allowed to use constant bus for another operand.
3602 // We can however allow an inline immediate as src0.
3603 bool Src0Inlined = false;
3604 if (Src0->isReg()) {
3605 // Try to inline constant if possible.
3606 // If the def is a move-immediate and this is its only use,
3607 // folding it here saves a VGPR.
3608 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3609 if (Def && Def->isMoveImmediate() &&
3610 isInlineConstant(Def->getOperand(1)) &&
3611 MRI->hasOneUse(Src0->getReg())) {
3612 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3613 Src0Inlined = true;
3614 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3615 RI.isSGPRReg(*MRI, Src0->getReg())) {
3616 return false;
3617 }
3618 // VGPR is okay as Src0 - fallthrough
3619 }
3620
3621 if (Src1->isReg() && !Src0Inlined) {
3622 // We have one slot for inlinable constant so far - try to fill it
3623 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3624 if (Def && Def->isMoveImmediate() &&
3625 isInlineConstant(Def->getOperand(1)) &&
3626 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3627 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3628 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3629 return false;
3630 // VGPR is okay as Src1 - fallthrough
3631 }
3632 }
3633
3634 unsigned NewOpc =
3635 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3636 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3637 : AMDGPU::V_FMAAK_F16)
3638 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3639 if (pseudoToMCOpcode(NewOpc) == -1)
3640 return false;
3641
3642 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3643 // would also require restricting their register classes. For now
3644 // just bail out.
3645 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3646 return false;
3647
3648 // FIXME: This would be a lot easier if we could return a new instruction
3649 // instead of having to modify in place.
3650
3651 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3652 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3653 Opc == AMDGPU::V_FMAC_F16_e64)
3654 UseMI.untieRegOperand(
3655 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3656
3657 // ChangingToImmediate adds Src2 back to the instruction.
3658 Src2->ChangeToImmediate(getImmFor(*Src2));
3659
3660 // These come before src2.
3661 removeModOperands(UseMI);
3662 UseMI.setDesc(get(NewOpc));
3663 // It might happen that UseMI was commuted and we now have an SGPR as
3664 // src1. If so, two inlined constants together with an SGPR are illegal,
3665 // so the operands must be relegalized.
3666 legalizeOperands(UseMI);
3667
3668 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3669 if (DeleteDef)
3670 DefMI.eraseFromParent();
3671
3672 return true;
3673 }
3674 }
3675
3676 return false;
3677}
3678
3679static bool
3680memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3681 ArrayRef<const MachineOperand *> BaseOps2) {
3682 if (BaseOps1.size() != BaseOps2.size())
3683 return false;
3684 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3685 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3686 return false;
3687 }
3688 return true;
3689}
3690
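// Returns true if the access [OffsetA, OffsetA + WidthA) provably does not
// overlap [OffsetB, OffsetB + WidthB): the lower access must have a known
// width and end at or before the start of the higher one.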
3691static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3692 LocationSize WidthB, int OffsetB) {
3693 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3694 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3695 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3696 return LowWidth.hasValue() &&
3697 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3698}
3699
3700bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3701 const MachineInstr &MIb) const {
3702 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3703 int64_t Offset0, Offset1;
3704 LocationSize Dummy0 = 0, Dummy1 = 0;
3705 bool Offset0IsScalable, Offset1IsScalable;
3706 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3707 Dummy0, &RI) ||
3708 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3709 Dummy1, &RI))
3710 return false;
3711
3712 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3713 return false;
3714
3715 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3716 // FIXME: Handle ds_read2 / ds_write2.
3717 return false;
3718 }
3719 LocationSize Width0 = MIa.memoperands().front()->getSize();
3720 LocationSize Width1 = MIb.memoperands().front()->getSize();
3721 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3722}
3723
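// Conservative disjointness query: returns true only when the two memory
// accesses either use the same base operands with provably non-overlapping
// offsets, or target memory that cannot overlap (e.g. DS vs. non-scratch
// FLAT in the checks below).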
3724bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3725 const MachineInstr &MIb) const {
3726 assert(MIa.mayLoadOrStore() &&
3727 "MIa must load from or modify a memory location");
3728 assert(MIb.mayLoadOrStore() &&
3729 "MIb must load from or modify a memory location");
3730
3731 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3732 return false;
3733
3734 // XXX - Can we relax this between address spaces?
3735 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3736 return false;
3737
3738 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3739 return false;
3740
3741 // TODO: Should we check the address space from the MachineMemOperand? That
3742 // would allow us to distinguish objects we know don't alias based on the
3743 // underlying address space, even if it was lowered to a different one,
3744 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3745 // buffer.
3746 if (isDS(MIa)) {
3747 if (isDS(MIb))
3748 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3749
3750 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3751 }
3752
3753 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3754 if (isMUBUF(MIb) || isMTBUF(MIb))
3755 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3756
3757 if (isFLAT(MIb))
3758 return isFLATScratch(MIb);
3759
3760 return !isSMRD(MIb);
3761 }
3762
3763 if (isSMRD(MIa)) {
3764 if (isSMRD(MIb))
3765 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3766
3767 if (isFLAT(MIb))
3768 return isFLATScratch(MIb);
3769
3770 return !isMUBUF(MIb) && !isMTBUF(MIb);
3771 }
3772
3773 if (isFLAT(MIa)) {
3774 if (isFLAT(MIb)) {
3775 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3776 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3777 return true;
3778
3779 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3780 }
3781
3782 return false;
3783 }
3784
3785 return false;
3786}
3787
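// If \p Reg is virtual and its unique definition is a foldable
// move-immediate, return that immediate in \p Imm (and the defining
// instruction in \p DefMI if requested).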
3788static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3789 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3790 if (Reg.isPhysical())
3791 return false;
3792 auto *Def = MRI.getUniqueVRegDef(Reg);
3793 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3794 Imm = Def->getOperand(1).getImm();
3795 if (DefMI)
3796 *DefMI = Def;
3797 return true;
3798 }
3799 return false;
3800}
3801
3802static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3803 MachineInstr **DefMI = nullptr) {
3804 if (!MO->isReg())
3805 return false;
3806 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3807 const MachineRegisterInfo &MRI = MF->getRegInfo();
3808 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3809}
3810
3811static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3812 MachineInstr &NewMI) {
3813 if (LV) {
3814 unsigned NumOps = MI.getNumOperands();
3815 for (unsigned I = 1; I < NumOps; ++I) {
3816 MachineOperand &Op = MI.getOperand(I);
3817 if (Op.isReg() && Op.isKill())
3818 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3819 }
3820 }
3821}
3822
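// Rewrite a two-address MAC/FMAC (or a tied MFMA/WMMA form) into an untied
// three-address instruction, folding a foldable immediate into a
// madmk/madak-style opcode when possible. Returns nullptr if no conversion
// is possible.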
3823MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3824 LiveVariables *LV,
3825 LiveIntervals *LIS) const {
3826 MachineBasicBlock &MBB = *MI.getParent();
3827 unsigned Opc = MI.getOpcode();
3828
3829 // Handle MFMA.
3830 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3831 if (NewMFMAOpc != -1) {
3832 MachineInstrBuilder MIB =
3833 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3834 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3835 MIB.add(MI.getOperand(I));
3836 updateLiveVariables(LV, MI, *MIB);
3837 if (LIS) {
3838 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3839 // SlotIndex of defs needs to be updated when converting to early-clobber
3840 MachineOperand &Def = MIB->getOperand(0);
3841 if (Def.isEarlyClobber() && Def.isReg() &&
3842 LIS->hasInterval(Def.getReg())) {
3843 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3844 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3845 auto &LI = LIS->getInterval(Def.getReg());
3846 auto UpdateDefIndex = [&](LiveRange &LR) {
3847 auto S = LR.find(OldIndex);
3848 if (S != LR.end() && S->start == OldIndex) {
3849 assert(S->valno && S->valno->def == OldIndex);
3850 S->start = NewIndex;
3851 S->valno->def = NewIndex;
3852 }
3853 };
3854 UpdateDefIndex(LI);
3855 for (auto &SR : LI.subranges())
3856 UpdateDefIndex(SR);
3857 }
3858 }
3859 return MIB;
3860 }
3861
3862 if (SIInstrInfo::isWMMA(MI)) {
3863 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3864 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3865 .setMIFlags(MI.getFlags());
3866 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3867 MIB->addOperand(MI.getOperand(I));
3868
3869 updateLiveVariables(LV, MI, *MIB);
3870 if (LIS)
3871 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3872
3873 return MIB;
3874 }
3875
3876 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3877 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3878 "pre-RA");
3879
3880 // Handle MAC/FMAC.
3881 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3882 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3883 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3884 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3885 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3886 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3887 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3888 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3889 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3890 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3891 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3892 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3893 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3894 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3895 bool Src0Literal = false;
3896
3897 switch (Opc) {
3898 default:
3899 return nullptr;
3900 case AMDGPU::V_MAC_F16_e64:
3901 case AMDGPU::V_FMAC_F16_e64:
3902 case AMDGPU::V_FMAC_F16_t16_e64:
3903 case AMDGPU::V_MAC_F32_e64:
3904 case AMDGPU::V_MAC_LEGACY_F32_e64:
3905 case AMDGPU::V_FMAC_F32_e64:
3906 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3907 case AMDGPU::V_FMAC_F64_e64:
3908 break;
3909 case AMDGPU::V_MAC_F16_e32:
3910 case AMDGPU::V_FMAC_F16_e32:
3911 case AMDGPU::V_MAC_F32_e32:
3912 case AMDGPU::V_MAC_LEGACY_F32_e32:
3913 case AMDGPU::V_FMAC_F32_e32:
3914 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3915 case AMDGPU::V_FMAC_F64_e32: {
3916 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3917 AMDGPU::OpName::src0);
3918 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3919 if (!Src0->isReg() && !Src0->isImm())
3920 return nullptr;
3921
3922 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3923 Src0Literal = true;
3924
3925 break;
3926 }
3927 }
3928
3929 MachineInstrBuilder MIB;
3930 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3931 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3932 const MachineOperand *Src0Mods =
3933 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3934 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3935 const MachineOperand *Src1Mods =
3936 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3937 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3938 const MachineOperand *Src2Mods =
3939 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3940 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3941 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3942 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3943
3944 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3945 !IsLegacy &&
3946 // If we have an SGPR input, we will violate the constant bus restriction.
3947 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3948 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3949 MachineInstr *DefMI;
3950 const auto killDef = [&]() -> void {
3951 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3952 // The only user is the instruction which will be killed.
3953 Register DefReg = DefMI->getOperand(0).getReg();
3954 if (!MRI.hasOneNonDBGUse(DefReg))
3955 return;
3956 // We cannot just remove DefMI here; the calling pass would crash.
3957 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3958 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3959 DefMI->removeOperand(I);
3960 if (LV)
3961 LV->getVarInfo(DefReg).AliveBlocks.clear();
3962 };
3963
3964 int64_t Imm;
3965 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3966 unsigned NewOpc =
3967 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3968 : AMDGPU::V_FMAAK_F16)
3969 : AMDGPU::V_FMAAK_F32)
3970 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3971 if (pseudoToMCOpcode(NewOpc) != -1) {
3972 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3973 .add(*Dst)
3974 .add(*Src0)
3975 .add(*Src1)
3976 .addImm(Imm)
3977 .setMIFlags(MI.getFlags());
3978 updateLiveVariables(LV, MI, *MIB);
3979 if (LIS)
3980 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3981 killDef();
3982 return MIB;
3983 }
3984 }
3985 unsigned NewOpc =
3986 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3987 : AMDGPU::V_FMAMK_F16)
3988 : AMDGPU::V_FMAMK_F32)
3989 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3990 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3991 if (pseudoToMCOpcode(NewOpc) != -1) {
3992 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3993 .add(*Dst)
3994 .add(*Src0)
3995 .addImm(Imm)
3996 .add(*Src2)
3997 .setMIFlags(MI.getFlags());
3998 updateLiveVariables(LV, MI, *MIB);
3999 if (LIS)
4000 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4001 killDef();
4002 return MIB;
4003 }
4004 }
4005 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4006 if (Src0Literal) {
4007 Imm = Src0->getImm();
4008 DefMI = nullptr;
4009 }
4010 if (pseudoToMCOpcode(NewOpc) != -1 &&
4011 isOperandLegal(
4012 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4013 Src1)) {
4014 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4015 .add(*Dst)
4016 .add(*Src1)
4017 .addImm(Imm)
4018 .add(*Src2)
4019 .setMIFlags(MI.getFlags());
4020 updateLiveVariables(LV, MI, *MIB);
4021 if (LIS)
4022 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4023 if (DefMI)
4024 killDef();
4025 return MIB;
4026 }
4027 }
4028 }
4029
4030 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4031 // if VOP3 does not allow a literal operand.
4032 if (Src0Literal && !ST.hasVOP3Literal())
4033 return nullptr;
4034
4035 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4036 : IsF64 ? AMDGPU::V_FMA_F64_e64
4037 : IsLegacy
4038 ? AMDGPU::V_FMA_LEGACY_F32_e64
4039 : AMDGPU::V_FMA_F32_e64
4040 : IsF16 ? AMDGPU::V_MAD_F16_e64
4041 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4042 : AMDGPU::V_MAD_F32_e64;
4043 if (pseudoToMCOpcode(NewOpc) == -1)
4044 return nullptr;
4045
4046 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4047 .add(*Dst)
4048 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4049 .add(*Src0)
4050 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4051 .add(*Src1)
4052 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4053 .add(*Src2)
4054 .addImm(Clamp ? Clamp->getImm() : 0)
4055 .addImm(Omod ? Omod->getImm() : 0)
4056 .setMIFlags(MI.getFlags());
4057 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4058 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4059 updateLiveVariables(LV, MI, *MIB);
4060 if (LIS)
4061 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4062 return MIB;
4063}
4064
4065// It's not generally safe to move VALU instructions across these since it will
4066// start using the register as a base index rather than directly.
4067// XXX - Why isn't hasSideEffects sufficient for these?
4068static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4069 switch (MI.getOpcode()) {
4070 case AMDGPU::S_SET_GPR_IDX_ON:
4071 case AMDGPU::S_SET_GPR_IDX_MODE:
4072 case AMDGPU::S_SET_GPR_IDX_OFF:
4073 return true;
4074 default:
4075 return false;
4076 }
4077}
4078
4079bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4080 const MachineBasicBlock *MBB,
4081 const MachineFunction &MF) const {
4082 // Skipping the check for SP writes in the base implementation. The reason it
4083 // was added was apparently due to compile time concerns.
4084 //
4085 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4086 // but is probably avoidable.
4087
4088 // Copied from base implementation.
4089 // Terminators and labels can't be scheduled around.
4090 if (MI.isTerminator() || MI.isPosition())
4091 return true;
4092
4093 // INLINEASM_BR can jump to another block
4094 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4095 return true;
4096
4097 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4098 return true;
4099
4100 // Target-independent instructions do not have an implicit-use of EXEC, even
4101 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4102 // boundaries prevents incorrect movements of such instructions.
4103 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4104 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4105 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4106 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4107 changesVGPRIndexingMode(MI);
4108}
4109
4110bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4111 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4112}
4113
4114bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4115 // Skip the full operand and register alias search modifiesRegister
4116 // does. There's only a handful of instructions that touch this, it's only an
4117 // implicit def, and doesn't alias any other registers.
4118 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4119}
4120
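// Whether executing \p MI with an empty EXEC mask could still have visible
// effects (scalar memory, messages, traps, barriers, exports, lane
// accesses), so it must not be treated as a no-op for inactive waves.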
4121bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4122 unsigned Opcode = MI.getOpcode();
4123
4124 if (MI.mayStore() && isSMRD(MI))
4125 return true; // scalar store or atomic
4126
4127 // This will terminate the function when other lanes may need to continue.
4128 if (MI.isReturn())
4129 return true;
4130
4131 // These instructions cause shader I/O that may cause hardware lockups
4132 // when executed with an empty EXEC mask.
4133 //
4134 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4135 // EXEC = 0, but checking for that case here seems not worth it
4136 // given the typical code patterns.
4137 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4138 isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
4139 Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
4140 return true;
4141
4142 if (MI.isCall() || MI.isInlineAsm())
4143 return true; // conservative assumption
4144
4145 // Assume that barrier interactions are only intended with active lanes.
4146 if (isBarrier(Opcode))
4147 return true;
4148
4149 // A mode change is a scalar operation that influences vector instructions.
4150 if (modifiesModeRegister(MI))
4151 return true;
4152
4153 // These are like SALU instructions in terms of effects, so it's questionable
4154 // whether we should return true for those.
4155 //
4156 // However, executing them with EXEC = 0 causes them to operate on undefined
4157 // data, which we avoid by returning true here.
4158 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4159 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4160 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4161 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4162 return true;
4163
4164 return false;
4165}
4166
4167bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4168 const MachineInstr &MI) const {
4169 if (MI.isMetaInstruction())
4170 return false;
4171
4172 // This won't read exec if this is an SGPR->SGPR copy.
4173 if (MI.isCopyLike()) {
4174 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4175 return true;
4176
4177 // Make sure this isn't copying exec as a normal operand
4178 return MI.readsRegister(AMDGPU::EXEC, &RI);
4179 }
4180
4181 // Make a conservative assumption about the callee.
4182 if (MI.isCall())
4183 return true;
4184
4185 // Be conservative with any unhandled generic opcodes.
4186 if (!isTargetSpecificOpcode(MI.getOpcode()))
4187 return true;
4188
4189 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4190}
4191
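// Inline constants are encoded in the instruction itself and do not occupy
// the extra literal dword, so they are free both in encoding size and with
// respect to the constant bus. Which values qualify depends on the operand
// bit width and on subtarget features (e.g. the inv-2pi constant).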
4192bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4193 switch (Imm.getBitWidth()) {
4194 case 1: // This likely will be a condition code mask.
4195 return true;
4196
4197 case 32:
4198 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4199 ST.hasInv2PiInlineImm());
4200 case 64:
4201 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4202 ST.hasInv2PiInlineImm());
4203 case 16:
4204 return ST.has16BitInsts() &&
4205 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4206 ST.hasInv2PiInlineImm());
4207 default:
4208 llvm_unreachable("invalid bitwidth");
4209 }
4210}
4211
4212bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4213 APInt IntImm = Imm.bitcastToAPInt();
4214 int64_t IntImmVal = IntImm.getSExtValue();
4215 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4216 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4217 default:
4218 llvm_unreachable("invalid fltSemantics");
4219 case APFloatBase::S_IEEEsingle:
4220 case APFloatBase::S_IEEEdouble:
4221 return isInlineConstant(IntImm);
4222 case APFloatBase::S_BFloat:
4223 return ST.has16BitInsts() &&
4224 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4225 case APFloatBase::S_IEEEhalf:
4226 return ST.has16BitInsts() &&
4227 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4228 }
4229}
4230
4231bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4232 uint8_t OperandType) const {
4233 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4234 if (!MO.isImm())
4235 return false;
4236
4237 // MachineOperand provides no way to tell the true operand size, since it only
4238 // records a 64-bit value. We need to know the size to determine if a 32-bit
4239 // floating point immediate bit pattern is legal for an integer immediate. It
4240 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4241
4242 int64_t Imm = MO.getImm();
4243 switch (OperandType) {
4256 int32_t Trunc = static_cast<int32_t>(Imm);
4258 }
4265 ST.hasInv2PiInlineImm());
4269 // We would expect inline immediates to not be concerned with an integer/fp
4270 // distinction. However, in the case of 16-bit integer operations, the
4271 // "floating point" values appear to not work. It seems read the low 16-bits
4272 // of 32-bit immediates, which happens to always work for the integer
4273 // values.
4274 //
4275 // See llvm bugzilla 46302.
4276 //
4277 // TODO: Theoretically we could use op-sel to use the high bits of the
4278 // 32-bit FP values.
4296 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4297 // A few special case instructions have 16-bit operands on subtargets
4298 // where 16-bit instructions are not legal.
4299 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4300 // constants in these cases
4301 int16_t Trunc = static_cast<int16_t>(Imm);
4302 return ST.has16BitInsts() &&
4304 }
4305
4306 return false;
4307 }
4312 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4313 int16_t Trunc = static_cast<int16_t>(Imm);
4314 return ST.has16BitInsts() &&
4316 }
4317 return false;
4318 }
4321 return false;
4324 // Always embedded in the instruction for free.
4325 return true;
4335 // Just ignore anything else.
4336 return true;
4337 default:
4338 llvm_unreachable("invalid operand type");
4339 }
4340}
4341
4342static bool compareMachineOp(const MachineOperand &Op0,
4343 const MachineOperand &Op1) {
4344 if (Op0.getType() != Op1.getType())
4345 return false;
4346
4347 switch (Op0.getType()) {
4348 case MachineOperand::MO_Register:
4349 return Op0.getReg() == Op1.getReg();
4350 case MachineOperand::MO_Immediate:
4351 return Op0.getImm() == Op1.getImm();
4352 default:
4353 llvm_unreachable("Didn't expect to be comparing these operand types");
4354 }
4355}
4356
4357bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4358 const MachineOperand &MO) const {
4359 const MCInstrDesc &InstDesc = MI.getDesc();
4360 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4361
4362 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4363
4363
4364 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4365 return true;
4366
4367 if (OpInfo.RegClass < 0)
4368 return false;
4369
4370 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4371 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4372 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4373 AMDGPU::OpName::src2))
4374 return false;
4375 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4376 }
4377
4378 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4379 return false;
4380
4381 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4382 return true;
4383
4384 return ST.hasVOP3Literal();
4385}
4386
4387bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4388 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4389 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4390 return false;
4391
4392 int Op32 = AMDGPU::getVOPe32(Opcode);
4393 if (Op32 == -1)
4394 return false;
4395
4396 return pseudoToMCOpcode(Op32) != -1;
4397}
4398
4399bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4400 // The src0_modifier operand is present on all instructions
4401 // that have modifiers.
4402
4403 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4404}
4405
4406bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4407 unsigned OpName) const {
4408 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4409 return Mods && Mods->getImm();
4410}
4411
4412bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4413 return any_of(ModifierOpNames,
4414 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4415}
4416
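// Whether this instruction can be shrunk to its 32-bit (e32) encoding:
// src1/src2 must be plain VGPRs where present, no source or output
// modifiers may be set, and a 32-bit counterpart of the opcode must exist.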
4417bool SIInstrInfo::canShrink(const MachineInstr &MI,
4418 const MachineRegisterInfo &MRI) const {
4419 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4420 // Can't shrink instruction with three operands.
4421 if (Src2) {
4422 switch (MI.getOpcode()) {
4423 default: return false;
4424
4425 case AMDGPU::V_ADDC_U32_e64:
4426 case AMDGPU::V_SUBB_U32_e64:
4427 case AMDGPU::V_SUBBREV_U32_e64: {
4428 const MachineOperand *Src1
4429 = getNamedOperand(MI, AMDGPU::OpName::src1);
4430 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4431 return false;
4432 // Additional verification is needed for sdst/src2.
4433 return true;
4434 }
4435 case AMDGPU::V_MAC_F16_e64:
4436 case AMDGPU::V_MAC_F32_e64:
4437 case AMDGPU::V_MAC_LEGACY_F32_e64:
4438 case AMDGPU::V_FMAC_F16_e64:
4439 case AMDGPU::V_FMAC_F16_t16_e64:
4440 case AMDGPU::V_FMAC_F32_e64:
4441 case AMDGPU::V_FMAC_F64_e64:
4442 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4443 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4444 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4445 return false;
4446 break;
4447
4448 case AMDGPU::V_CNDMASK_B32_e64:
4449 break;
4450 }
4451 }
4452
4453 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4454 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4455 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4456 return false;
4457
4458 // We don't need to check src0, all input types are legal, so just make sure
4459 // src0 isn't using any modifiers.
4460 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4461 return false;
4462
4463 // Can it be shrunk to a valid 32 bit opcode?
4464 if (!hasVALU32BitEncoding(MI.getOpcode()))
4465 return false;
4466
4467 // Check output modifiers
4468 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4469 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4470 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4471}
4472
4473// Set VCC operand with all flags from \p Orig, except for setting it as
4474// implicit.
4475static void copyFlagsToImplicitVCC(MachineInstr &MI,
4476 const MachineOperand &Orig) {
4477
4478 for (MachineOperand &Use : MI.implicit_operands()) {
4479 if (Use.isUse() &&
4480 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4481 Use.setIsUndef(Orig.isUndef());
4482 Use.setIsKill(Orig.isKill());
4483 return;
4484 }
4485 }
4486}
4487
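// Materialize the shrunk 32-bit encoding \p Op32 of \p MI, copying the defs
// and those explicit uses that still exist in the smaller encoding; for
// V_CNDMASK the explicit src2 becomes an implicit read of vcc/vcc_lo.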
4488MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4489 unsigned Op32) const {
4490 MachineBasicBlock *MBB = MI.getParent();
4491
4492 const MCInstrDesc &Op32Desc = get(Op32);
4493 MachineInstrBuilder Inst32 =
4494 BuildMI(*MBB, MI, MI.getDebugLoc(), Op32Desc)
4495 .setMIFlags(MI.getFlags());
4496
4497 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4498 // For VOPC instructions, this is replaced by an implicit def of vcc.
4499
4500 // We assume the defs of the shrunk opcode are in the same order, and the
4501 // shrunk opcode loses the last def (SGPR def, in the VOP3->VOPC case).
4502 for (int I = 0, E = Op32Desc.getNumDefs(); I != E; ++I)
4503 Inst32.add(MI.getOperand(I));
4504
4505 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4506
4507 int Idx = MI.getNumExplicitDefs();
4508 for (const MachineOperand &Use : MI.explicit_uses()) {
4509 int OpTy = MI.getDesc().operands()[Idx++].OperandType;
4510 if (OpTy == AMDGPU::OPERAND_INPUT_MODS || OpTy == MCOI::OPERAND_IMMEDIATE)
4511 continue;
4512
4513 if (&Use == Src2) {
4514 if (AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2) == -1) {
4515 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4516 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4517 // of vcc was already added during the initial BuildMI, but we
4518 // 1) may need to change vcc to vcc_lo to preserve the original register
4519 // 2) have to preserve the original flags.
4520 fixImplicitOperands(*Inst32);
4521 copyFlagsToImplicitVCC(*Inst32, *Src2);
4522 continue;
4523 }
4524 }
4525
4526 Inst32.add(Use);
4527 }
4528
4529 // FIXME: Losing implicit operands
4530
4531 return Inst32;
4532}
4533
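// An operand occupies the constant bus if it is a non-inline literal
// immediate or an SGPR-class register (including M0 and VCC); VGPRs, inline
// constants and the null register do not.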
4534bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4535 const MachineOperand &MO,
4536 const MCOperandInfo &OpInfo) const {
4537 // Literal constants use the constant bus.
4538 if (!MO.isReg())
4539 return !isInlineConstant(MO, OpInfo);
4540
4541 if (!MO.isUse())
4542 return false;
4543
4544 if (MO.getReg().isVirtual())
4545 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4546
4547 // Null is free
4548 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4549 return false;
4550
4551 // SGPRs use the constant bus
4552 if (MO.isImplicit()) {
4553 return MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
4554 MO.getReg() == AMDGPU::VCC_LO;
4555 }
4556 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4557 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4558}
4559
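// Look for an implicit read of a scalar special register (VCC, M0,
// FLAT_SCR); such a read counts toward the constant bus limit checked in
// the verifier below.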
4560static Register findImplicitSGPRRead(const MachineInstr &MI) {
4561 for (const MachineOperand &MO : MI.implicit_operands()) {
4562 // We only care about reads.
4563 if (MO.isDef())
4564 continue;
4565
4566 switch (MO.getReg()) {
4567 case AMDGPU::VCC:
4568 case AMDGPU::VCC_LO:
4569 case AMDGPU::VCC_HI:
4570 case AMDGPU::M0:
4571 case AMDGPU::FLAT_SCR:
4572 return MO.getReg();
4573
4574 default:
4575 break;
4576 }
4577 }
4578
4579 return Register();
4580}
4581
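// Returns true if this instruction is expected to carry an implicit use of
// EXEC; used by verifyInstruction below to catch instruction definitions
// that dropped the implicit exec operand.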
4582static bool shouldReadExec(const MachineInstr &MI) {
4583 if (SIInstrInfo::isVALU(MI)) {
4584 switch (MI.getOpcode()) {
4585 case AMDGPU::V_READLANE_B32:
4586 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4587 case AMDGPU::V_WRITELANE_B32:
4588 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4589 return false;
4590 }
4591
4592 return true;
4593 }
4594
4595 if (MI.isPreISelOpcode() ||
4596 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4597 SIInstrInfo::isSALU(MI) ||
4598 SIInstrInfo::isSMRD(MI))
4599 return false;
4600
4601 return true;
4602}
4603
4604static bool isRegOrFI(const MachineOperand &MO) {
4605 return MO.isReg() || MO.isFI();
4606}
4607
4608static bool isSubRegOf(const SIRegisterInfo &TRI,
4609 const MachineOperand &SuperVec,
4610 const MachineOperand &SubReg) {
4611 if (SubReg.getReg().isPhysical())
4612 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4613
4614 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4615 SubReg.getReg() == SuperVec.getReg();
4616}
4617
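// Machine verifier hook: check AMDGPU-specific constraints (operand counts
// and register classes, constant bus usage, SDWA/DPP/MIMG encoding rules,
// vector register alignment, ...) and report the first violation through
// \p ErrInfo.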
4618bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4619 StringRef &ErrInfo) const {
4620 uint16_t Opcode = MI.getOpcode();
4621 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4622 return true;
4623
4624 const MachineFunction *MF = MI.getParent()->getParent();
4625 const MachineRegisterInfo &MRI = MF->getRegInfo();
4626
4627 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4628 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4629 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4630 int Src3Idx = -1;
4631 if (Src0Idx == -1) {
4632 // VOPD V_DUAL_* instructions use different operand names.
4633 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4634 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4635 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4636 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4637 }
4638
4639 // Make sure the number of operands is correct.
4640 const MCInstrDesc &Desc = get(Opcode);
4641 if (!Desc.isVariadic() &&
4642 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4643 ErrInfo = "Instruction has wrong number of operands.";
4644 return false;
4645 }
4646
4647 if (MI.isInlineAsm()) {
4648 // Verify register classes for inlineasm constraints.
4649 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4650 I != E; ++I) {
4651 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4652 if (!RC)
4653 continue;
4654
4655 const MachineOperand &Op = MI.getOperand(I);
4656 if (!Op.isReg())
4657 continue;
4658
4659 Register Reg = Op.getReg();
4660 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4661 ErrInfo = "inlineasm operand has incorrect register class.";
4662 return false;
4663 }
4664 }
4665
4666 return true;
4667 }
4668
4669 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4670 ErrInfo = "missing memory operand from image instruction.";
4671 return false;
4672 }
4673
4674 // Make sure the register classes are correct.
4675 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4676 const MachineOperand &MO = MI.getOperand(i);
4677 if (MO.isFPImm()) {
4678 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4679 "all fp values to integers.";
4680 return false;
4681 }
4682
4683 int RegClass = Desc.operands()[i].RegClass;
4684
4685 switch (Desc.operands()[i].OperandType) {
4686 case MCOI::OPERAND_REGISTER:
4687 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4688 ErrInfo = "Illegal immediate value for operand.";
4689 return false;
4690 }
4691 break;
4696 break;
4708 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4709 ErrInfo = "Illegal immediate value for operand.";
4710 return false;
4711 }
4712 break;
4713 }
4715 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4716 ErrInfo = "Expected inline constant for operand.";
4717 return false;
4718 }
4719 break;
4722 // Check if this operand is an immediate.
4723 // FrameIndex operands will be replaced by immediates, so they are
4724 // allowed.
4725 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4726 ErrInfo = "Expected immediate, but got non-immediate";
4727 return false;
4728 }
4729 [[fallthrough]];
4730 default:
4731 continue;
4732 }
4733
4734 if (!MO.isReg())
4735 continue;
4736 Register Reg = MO.getReg();
4737 if (!Reg)
4738 continue;
4739
4740 // FIXME: Ideally we would have separate instruction definitions with the
4741 // aligned register constraint.
4742 // FIXME: We do not verify inline asm operands, but custom inline asm
4743 // verification is broken anyway
4744 if (ST.needsAlignedVGPRs()) {
4745 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4746 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4747 const TargetRegisterClass *SubRC =
4748 RI.getSubRegisterClass(RC, MO.getSubReg());
4749 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4750 if (RC)
4751 RC = SubRC;
4752 }
4753
4754 // Check that this is the aligned version of the class.
4755 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4756 ErrInfo = "Subtarget requires even aligned vector registers";
4757 return false;
4758 }
4759 }
4760
4761 if (RegClass != -1) {
4762 if (Reg.isVirtual())
4763 continue;
4764
4765 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4766 if (!RC->contains(Reg)) {
4767 ErrInfo = "Operand has incorrect register class.";
4768 return false;
4769 }
4770 }
4771 }
4772
4773 // Verify SDWA
4774 if (isSDWA(MI)) {
4775 if (!ST.hasSDWA()) {
4776 ErrInfo = "SDWA is not supported on this target";
4777 return false;
4778 }
4779
4780 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4781
4782 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4783 if (OpIdx == -1)
4784 continue;
4785 const MachineOperand &MO = MI.getOperand(OpIdx);
4786
4787 if (!ST.hasSDWAScalar()) {
4788 // Only VGPRS on VI
4789 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4790 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4791 return false;
4792 }
4793 } else {
4794 // No immediates on GFX9
4795 if (!MO.isReg()) {
4796 ErrInfo =
4797 "Only reg allowed as operands in SDWA instructions on GFX9+";
4798 return false;
4799 }
4800 }
4801 }
4802
4803 if (!ST.hasSDWAOmod()) {
4804 // No omod allowed on VI
4805 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4806 if (OMod != nullptr &&
4807 (!OMod->isImm() || OMod->getImm() != 0)) {
4808 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4809 return false;
4810 }
4811 }
4812
4813 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4814 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4815 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4816 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4817 const MachineOperand *Src0ModsMO =
4818 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4819 unsigned Mods = Src0ModsMO->getImm();
4820 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4821 Mods & SISrcMods::SEXT) {
4822 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4823 return false;
4824 }
4825 }
4826
4827 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4828 if (isVOPC(BasicOpcode)) {
4829 if (!ST.hasSDWASdst() && DstIdx != -1) {
4830 // Only vcc allowed as dst on VI for VOPC
4831 const MachineOperand &Dst = MI.getOperand(DstIdx);
4832 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4833 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4834 return false;
4835 }
4836 } else if (!ST.hasSDWAOutModsVOPC()) {
4837 // No clamp allowed on GFX9 for VOPC
4838 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4839 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4840 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4841 return false;
4842 }
4843
4844 // No omod allowed on GFX9 for VOPC
4845 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4846 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4847 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4848 return false;
4849 }
4850 }
4851 }
4852
4853 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4854 if (DstUnused && DstUnused->isImm() &&
4855 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4856 const MachineOperand &Dst = MI.getOperand(DstIdx);
4857 if (!Dst.isReg() || !Dst.isTied()) {
4858 ErrInfo = "Dst register should have tied register";
4859 return false;
4860 }
4861
4862 const MachineOperand &TiedMO =
4863 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4864 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4865 ErrInfo =
4866 "Dst register should be tied to implicit use of preserved register";
4867 return false;
4868 }
4869 if (TiedMO.getReg().isPhysical() && Dst.getReg() != TiedMO.getReg()) {
4870 ErrInfo = "Dst register should use same physical register as preserved";
4871 return false;
4872 }
4873 }
4874 }
4875
4876 // Verify MIMG / VIMAGE / VSAMPLE
4877 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4878 // Ensure that the return type used is large enough for all the options
4879 // being used TFE/LWE require an extra result register.
4880 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4881 if (DMask) {
4882 uint64_t DMaskImm = DMask->getImm();
4883 uint32_t RegCount =
4884 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4885 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4886 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4887 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4888
4889 // Adjust for packed 16 bit values
4890 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4891 RegCount = divideCeil(RegCount, 2);
4892
4893 // Adjust if using LWE or TFE
4894 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4895 RegCount += 1;
4896
4897 const uint32_t DstIdx =
4898 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4899 const MachineOperand &Dst = MI.getOperand(DstIdx);
4900 if (Dst.isReg()) {
4901 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4902 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4903 if (RegCount > DstSize) {
4904 ErrInfo = "Image instruction returns too many registers for dst "
4905 "register class";
4906 return false;
4907 }
4908 }
4909 }
4910 }
4911
4912 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4913 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4914 unsigned ConstantBusCount = 0;
4915 bool UsesLiteral = false;
4916 const MachineOperand *LiteralVal = nullptr;
4917
4918 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4919 if (ImmIdx != -1) {
4920 ++ConstantBusCount;
4921 UsesLiteral = true;
4922 LiteralVal = &MI.getOperand(ImmIdx);
4923 }
4924
4925 SmallVector<Register, 2> SGPRsUsed;
4926 Register SGPRUsed;
4927
4928 // Only look at the true operands. Only a real operand can use the constant
4929 // bus, and we don't want to check pseudo-operands like the source modifier
4930 // flags.
4931 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4932 if (OpIdx == -1)
4933 continue;
4934 const MachineOperand &MO = MI.getOperand(OpIdx);
4935 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4936 if (MO.isReg()) {
4937 SGPRUsed = MO.getReg();
4938 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4939 ++ConstantBusCount;
4940 SGPRsUsed.push_back(SGPRUsed);
4941 }
4942 } else if (!MO.isFI()) { // Treat FI like a register.
4943 if (!UsesLiteral) {
4944 ++ConstantBusCount;
4945 UsesLiteral = true;
4946 LiteralVal = &MO;
4947 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4948 assert(isVOP2(MI) || isVOP3(MI));
4949 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4950 return false;
4951 }
4952 }
4953 }
4954 }
4955
4956 SGPRUsed = findImplicitSGPRRead(MI);
4957 if (SGPRUsed) {
4958 // Implicit uses may safely overlap true operands
4959 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4960 return !RI.regsOverlap(SGPRUsed, SGPR);
4961 })) {
4962 ++ConstantBusCount;
4963 SGPRsUsed.push_back(SGPRUsed);
4964 }
4965 }
4966
4967 // v_writelane_b32 is an exception to the constant bus restriction:
4968 // vsrc0 can be an sgpr, a constant or m0, and the lane select can be an sgpr, m0 or an inline constant.
4969 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4970 Opcode != AMDGPU::V_WRITELANE_B32) {
4971 ErrInfo = "VOP* instruction violates constant bus restriction";
4972 return false;
4973 }
4974
4975 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4976 ErrInfo = "VOP3 instruction uses literal";
4977 return false;
4978 }
4979 }
4980
4981 // Special case for writelane - this can break the multiple constant bus rule,
4982 // but still can't use more than one SGPR register
4983 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4984 unsigned SGPRCount = 0;
4985 Register SGPRUsed;
4986
4987 for (int OpIdx : {Src0Idx, Src1Idx}) {
4988 if (OpIdx == -1)
4989 break;
4990
4991 const MachineOperand &MO = MI.getOperand(OpIdx);
4992
4993 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4994 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4995 if (MO.getReg() != SGPRUsed)
4996 ++SGPRCount;
4997 SGPRUsed = MO.getReg();
4998 }
4999 }
5000 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
5001 ErrInfo = "WRITELANE instruction violates constant bus restriction";
5002 return false;
5003 }
5004 }
5005 }
5006
5007 // Verify misc. restrictions on specific instructions.
5008 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
5009 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
5010 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5011 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5012 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5013 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5014 if (!compareMachineOp(Src0, Src1) &&
5015 !compareMachineOp(Src0, Src2)) {
5016 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5017 return false;
5018 }
5019 }
5020 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5021 SISrcMods::ABS) ||
5022 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5023 SISrcMods::ABS) ||
5024 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5025 SISrcMods::ABS)) {
5026 ErrInfo = "ABS not allowed in VOP3B instructions";
5027 return false;
5028 }
5029 }
5030
5031 if (isSOP2(MI) || isSOPC(MI)) {
5032 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5033 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5034
5035 if (!isRegOrFI(Src0) && !isRegOrFI(Src1) &&
5036 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5037 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5038 !Src0.isIdenticalTo(Src1)) {
5039 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5040 return false;
5041 }
5042 }
5043
5044 if (isSOPK(MI)) {
5045 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5046 if (Desc.isBranch()) {
5047 if (!Op->isMBB()) {
5048 ErrInfo = "invalid branch target for SOPK instruction";
5049 return false;
5050 }
5051 } else {
5052 uint64_t Imm = Op->getImm();
5053 if (sopkIsZext(Opcode)) {
5054 if (!isUInt<16>(Imm)) {
5055 ErrInfo = "invalid immediate for SOPK instruction";
5056 return false;
5057 }
5058 } else {
5059 if (!isInt<16>(Imm)) {
5060 ErrInfo = "invalid immediate for SOPK instruction";
5061 return false;
5062 }
5063 }
5064 }
5065 }
5066
5067 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5068 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5069 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5070 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5071 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5072 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5073
5074 const unsigned StaticNumOps =
5075 Desc.getNumOperands() + Desc.implicit_uses().size();
5076 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5077
5078 // Allow additional implicit operands. This allows a fixup done by the post
5079 // RA scheduler where the main implicit operand is killed and implicit-defs
5080 // are added for sub-registers that remain live after this instruction.
5081 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5082 ErrInfo = "missing implicit register operands";
5083 return false;
5084 }
5085
5086 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5087 if (IsDst) {
5088 if (!Dst->isUse()) {
5089 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5090 return false;
5091 }
5092
5093 unsigned UseOpIdx;
5094 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5095 UseOpIdx != StaticNumOps + 1) {
5096 ErrInfo = "movrel implicit operands should be tied";
5097 return false;
5098 }
5099 }
5100
5101 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5102 const MachineOperand &ImpUse
5103 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5104 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5105 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5106 ErrInfo = "src0 should be subreg of implicit vector use";
5107 return false;
5108 }
5109 }
5110
5111 // Make sure we aren't losing exec uses in the td files. This mostly requires
5112 // being careful when using let Uses to try to add other use registers.
5113 if (shouldReadExec(MI)) {
5114 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5115 ErrInfo = "VALU instruction does not implicitly read exec mask";
5116 return false;
5117 }
5118 }
5119
5120 if (isSMRD(MI)) {
5121 if (MI.mayStore() &&
5122 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5123 // The register offset form of scalar stores may only use m0 as the
5124 // soffset register.
5125 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5126 if (Soff && Soff->getReg() != AMDGPU::M0) {
5127 ErrInfo = "scalar stores must use m0 as offset register";
5128 return false;
5129 }
5130 }
5131 }
5132
5133 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5134 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5135 if (Offset->getImm() != 0) {
5136 ErrInfo = "subtarget does not support offsets in flat instructions";
5137 return false;
5138 }
5139 }
5140
5141 if (isDS(MI) && !ST.hasGDS()) {
5142 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5143 if (GDSOp && GDSOp->getImm() != 0) {
5144 ErrInfo = "GDS is not supported on this subtarget";
5145 return false;
5146 }
5147 }
5148
5149 if (isImage(MI)) {
5150 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5151 if (DimOp) {
5152 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5153 AMDGPU::OpName::vaddr0);
5154 int RSrcOpName =
5155 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5156 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5157 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5158 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5159 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5160 const AMDGPU::MIMGDimInfo *Dim =
5161 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5162
5163 if (!Dim) {
5164 ErrInfo = "dim is out of range";
5165 return false;
5166 }
5167
5168 bool IsA16 = false;
5169 if (ST.hasR128A16()) {
5170 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5171 IsA16 = R128A16->getImm() != 0;
5172 } else if (ST.hasA16()) {
5173 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5174 IsA16 = A16->getImm() != 0;
5175 }
5176
5177 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5178
5179 unsigned AddrWords =
5180 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5181
5182 unsigned VAddrWords;
5183 if (IsNSA) {
5184 VAddrWords = RsrcIdx - VAddr0Idx;
5185 if (ST.hasPartialNSAEncoding() &&
5186 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5187 unsigned LastVAddrIdx = RsrcIdx - 1;
5188 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5189 }
5190 } else {
5191 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5192 if (AddrWords > 12)
5193 AddrWords = 16;
5194 }
5195
5196 if (VAddrWords != AddrWords) {
5197 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5198 << " but got " << VAddrWords << "\n");
5199 ErrInfo = "bad vaddr size";
5200 return false;
5201 }
5202 }
5203 }
5204
5205 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5206 if (DppCt) {
5207 using namespace AMDGPU::DPP;
5208
5209 unsigned DC = DppCt->getImm();
5210 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5211 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5212 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5213 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5214 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5215 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5216 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5217 ErrInfo = "Invalid dpp_ctrl value";
5218 return false;
5219 }
5220 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5221 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5222 ErrInfo = "Invalid dpp_ctrl value: "
5223 "wavefront shifts are not supported on GFX10+";
5224 return false;
5225 }
5226 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5227 ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
5228 ErrInfo = "Invalid dpp_ctrl value: "
5229 "broadcasts are not supported on GFX10+";
5230 return false;
5231 }
5232 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5233 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5234 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5235 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5236 !ST.hasGFX90AInsts()) {
5237 ErrInfo = "Invalid dpp_ctrl value: "
5238 "row_newbroadcast/row_share is not supported before "
5239 "GFX90A/GFX10";
5240 return false;
5241 }
5242 if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5243 ErrInfo = "Invalid dpp_ctrl value: "
5244 "row_share and row_xmask are not supported before GFX10";
5245 return false;
5246 }
5247 }
5248
5249 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5250 !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
5251 ErrInfo = "Invalid dpp_ctrl value: "
5252 "DP ALU dpp only support row_newbcast";
5253 return false;
5254 }
5255 }
5256
5257 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5258 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5259 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5260 : AMDGPU::OpName::vdata;
5261 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5262 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5263 if (Data && !Data->isReg())
5264 Data = nullptr;
5265
5266 if (ST.hasGFX90AInsts()) {
5267 if (Dst && Data &&
5268 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5269 ErrInfo = "Invalid register class: "
5270 "vdata and vdst should be both VGPR or AGPR";
5271 return false;
5272 }
5273 if (Data && Data2 &&
5274 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5275 ErrInfo = "Invalid register class: "
5276 "both data operands should be VGPR or AGPR";
5277 return false;
5278 }
5279 } else {
5280 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5281 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5282 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5283 ErrInfo = "Invalid register class: "
5284 "agpr loads and stores not supported on this GPU";
5285 return false;
5286 }
5287 }
5288 }
5289
5290 if (ST.needsAlignedVGPRs()) {
5291 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5292 const MachineOperand *Op = getNamedOperand(MI, OpName);
5293 if (!Op)
5294 return true;
5295 Register Reg = Op->getReg();
5296 if (Reg.isPhysical())
5297 return !(RI.getHWRegIndex(Reg) & 1);
5298 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5299 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5300 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5301 };
5302
5303 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5304 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5305 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5306
5307 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5308 ErrInfo = "Subtarget requires even aligned vector registers "
5309 "for DS_GWS instructions";
5310 return false;
5311 }
5312 }
5313
5314 if (isMIMG(MI)) {
5315 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5316 ErrInfo = "Subtarget requires even aligned vector registers "
5317 "for vaddr operand of image instructions";
5318 return false;
5319 }
5320 }
5321 }
5322
5323 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5324 !ST.hasGFX90AInsts()) {
5325 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5326 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5327 ErrInfo = "Invalid register class: "
5328 "v_accvgpr_write with an SGPR is not supported on this GPU";
5329 return false;
5330 }
5331 }
5332
5333 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5334 const MachineOperand &SrcOp = MI.getOperand(1);
5335 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5336 ErrInfo = "pseudo expects only physical SGPRs";
5337 return false;
5338 }
5339 }
5340
5341 return true;
5342}
5343
5344// It is more readable to list mapped opcodes on the same line.
5345// clang-format off
5346
5347unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5348   switch (MI.getOpcode()) {
5349 default: return AMDGPU::INSTRUCTION_LIST_END;
5350 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5351 case AMDGPU::COPY: return AMDGPU::COPY;
5352 case AMDGPU::PHI: return AMDGPU::PHI;
5353 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5354 case AMDGPU::WQM: return AMDGPU::WQM;
5355 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5356 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5357 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5358 case AMDGPU::S_MOV_B32: {
5359 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5360 return MI.getOperand(1).isReg() ||
5361 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5362 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5363 }
5364 case AMDGPU::S_ADD_I32:
5365 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5366 case AMDGPU::S_ADDC_U32:
5367 return AMDGPU::V_ADDC_U32_e32;
5368 case AMDGPU::S_SUB_I32:
5369 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5370   // FIXME: These are not consistently handled, and are selected when the
5371   // carry is used.
5372 case AMDGPU::S_ADD_U32:
5373 return AMDGPU::V_ADD_CO_U32_e32;
5374 case AMDGPU::S_SUB_U32:
5375 return AMDGPU::V_SUB_CO_U32_e32;
5376 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5377 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5378 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5379 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5380 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5381 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5382 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5383 case AMDGPU::S_XNOR_B32:
5384 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5385 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5386 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5387 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5388 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5389 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5390 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5391 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5392 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5393 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5394 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5395 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5396 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5397 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5398 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5399 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5400 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5401 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5402 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5403 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5404 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5405 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5406 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5407 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5408 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5409 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5410 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5411 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5412 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5413 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5414 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5415 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5416 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5417 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5418 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5419 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5420 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5421 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5422 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5423 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5424 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5425 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5426 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5427 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5428 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5429 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5430 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5431 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5432 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5433 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5434 case AMDGPU::S_CEIL_F16:
5435 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5436 : AMDGPU::V_CEIL_F16_fake16_e64;
5437 case AMDGPU::S_FLOOR_F16:
5438 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5439 : AMDGPU::V_FLOOR_F16_fake16_e64;
5440 case AMDGPU::S_TRUNC_F16:
5441 return AMDGPU::V_TRUNC_F16_fake16_e64;
5442 case AMDGPU::S_RNDNE_F16:
5443 return AMDGPU::V_RNDNE_F16_fake16_e64;
5444 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5445 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5446 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5447 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5448 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5449 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5450 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5451 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5452 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5453 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5454 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5455 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5456 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5457 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5458 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5459 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5460 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5461 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5462 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5463 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5464 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5465 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5466 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5467 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5468 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5469 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5470 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5471 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5472 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5473 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5474 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5475 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5476 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5477 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5478 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5479 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5480 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5481 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5482 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5483 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5484 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5485 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5486 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5487 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5488 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5489 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5490 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5491 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5492 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5493 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5494 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5495 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5496 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5497 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5498 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5499 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5500 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5501 }
5502   llvm_unreachable(
5503                    "Unexpected scalar opcode without corresponding vector one!");
5504}
5505
5506// clang-format on
5507
5511 const DebugLoc &DL, Register Reg,
5512 bool IsSCCLive,
5513 SlotIndexes *Indexes) const {
5514 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5515 const SIInstrInfo *TII = ST.getInstrInfo();
5516 bool IsWave32 = ST.isWave32();
5517 if (IsSCCLive) {
5518 // Insert two move instructions, one to save the original value of EXEC and
5519     // the other to turn on all bits in EXEC. This is required because we can't
5520     // use the single S_OR_SAVEEXEC instruction, which clobbers SCC.
5521 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5522 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5523 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5524 .addReg(Exec, RegState::Kill);
5525 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5526 if (Indexes) {
5527 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5528 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5529 }
5530 } else {
5531 const unsigned OrSaveExec =
5532 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5533 auto SaveExec =
5534 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5535 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5536 if (Indexes)
5537 Indexes->insertMachineInstrInMaps(*SaveExec);
5538 }
5539}
5540
5543 const DebugLoc &DL, Register Reg,
5544 SlotIndexes *Indexes) const {
5545 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5546 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5547 auto ExecRestoreMI =
5548 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5549 if (Indexes)
5550 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5551}
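// For reference, on a wave64 target the save/restore pair built by the two
// helpers above expands to roughly the following MIR (illustrative only; the
// wave32 form uses the _B32 opcodes and EXEC_LO):
//
//   ; SCC live at the save point:        ; SCC dead at the save point:
//   %save = S_MOV_B64 killed $exec       %save = S_OR_SAVEEXEC_B64 -1
//   $exec = S_MOV_B64 -1
//   ...                                  ...
//   $exec = S_MOV_B64 killed %save       $exec = S_MOV_B64 killed %save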
5552
5553static const TargetRegisterClass *
5555 const MachineRegisterInfo &MRI,
5556 const MCInstrDesc &TID, unsigned RCID,
5557 bool IsAllocatable) {
5558 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5559 (((TID.mayLoad() || TID.mayStore()) &&
5560 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5562 switch (RCID) {
5563 case AMDGPU::AV_32RegClassID:
5564 RCID = AMDGPU::VGPR_32RegClassID;
5565 break;
5566 case AMDGPU::AV_64RegClassID:
5567 RCID = AMDGPU::VReg_64RegClassID;
5568 break;
5569 case AMDGPU::AV_96RegClassID:
5570 RCID = AMDGPU::VReg_96RegClassID;
5571 break;
5572 case AMDGPU::AV_128RegClassID:
5573 RCID = AMDGPU::VReg_128RegClassID;
5574 break;
5575 case AMDGPU::AV_160RegClassID:
5576 RCID = AMDGPU::VReg_160RegClassID;
5577 break;
5578 case AMDGPU::AV_512RegClassID:
5579 RCID = AMDGPU::VReg_512RegClassID;
5580 break;
5581 default:
5582 break;
5583 }
5584 }
5585
5586 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5587}
5588
5590 unsigned OpNum, const TargetRegisterInfo *TRI,
5591 const MachineFunction &MF)
5592 const {
5593 if (OpNum >= TID.getNumOperands())
5594 return nullptr;
5595 auto RegClass = TID.operands()[OpNum].RegClass;
5596 bool IsAllocatable = false;
5598     // vdst and vdata should both be VGPR or AGPR; the same holds for DS
5599     // instructions with two data operands. Request a register class constrained
5600     // to VGPR only if both operands are present, as Machine Copy Propagation
5601     // (and possibly other passes) cannot check this constraint.
5602 //
5603 // The check is limited to FLAT and DS because atomics in non-flat encoding
5604 // have their vdst and vdata tied to be the same register.
5605 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5606 AMDGPU::OpName::vdst);
5607 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5608 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5609 : AMDGPU::OpName::vdata);
5610 if (DataIdx != -1) {
5611 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5612 TID.Opcode, AMDGPU::OpName::data1);
5613 }
5614 }
5615 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5616 IsAllocatable);
5617}
5618
5620 unsigned OpNo) const {
5621 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5622 const MCInstrDesc &Desc = get(MI.getOpcode());
5623 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5624 Desc.operands()[OpNo].RegClass == -1) {
5625 Register Reg = MI.getOperand(OpNo).getReg();
5626
5627 if (Reg.isVirtual())
5628 return MRI.getRegClass(Reg);
5629 return RI.getPhysRegBaseClass(Reg);
5630 }
5631
5632 unsigned RCID = Desc.operands()[OpNo].RegClass;
5633 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5634}
5635
5638 MachineBasicBlock *MBB = MI.getParent();
5639 MachineOperand &MO = MI.getOperand(OpIdx);
5641 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5642 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5643 unsigned Size = RI.getRegSizeInBits(*RC);
5644 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO
5645 : Size == 16 ? AMDGPU::V_MOV_B16_t16_e64
5646 : AMDGPU::V_MOV_B32_e32;
5647 if (MO.isReg())
5648 Opcode = AMDGPU::COPY;
5649 else if (RI.isSGPRClass(RC))
5650 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5651
5652 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5653 Register Reg = MRI.createVirtualRegister(VRC);
5655 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5656 MO.ChangeToRegister(Reg, false);
5657}
5658
5661 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5662 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5663 MachineBasicBlock *MBB = MI->getParent();
5664 DebugLoc DL = MI->getDebugLoc();
5665 Register SubReg = MRI.createVirtualRegister(SubRC);
5666
5667 unsigned NewSubIdx = RI.composeSubRegIndices(SuperReg.getSubReg(), SubIdx);
5668 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5669 .addReg(SuperReg.getReg(), 0, NewSubIdx);
5670 return SubReg;
5671}
5672
5675 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5676 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5677 if (Op.isImm()) {
5678 if (SubIdx == AMDGPU::sub0)
5679 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5680 if (SubIdx == AMDGPU::sub1)
5681 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5682
5683 llvm_unreachable("Unhandled register index for immediate");
5684 }
5685
5686 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5687 SubIdx, SubRC);
5688 return MachineOperand::CreateReg(SubReg, false);
5689}
5690
5691// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5692void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5693 assert(Inst.getNumExplicitOperands() == 3);
5694 MachineOperand Op1 = Inst.getOperand(1);
5695 Inst.removeOperand(1);
5696 Inst.addOperand(Op1);
5697}
5698
5700 const MCOperandInfo &OpInfo,
5701 const MachineOperand &MO) const {
5702 if (!MO.isReg())
5703 return false;
5704
5705 Register Reg = MO.getReg();
5706
5707 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5708 if (Reg.isPhysical())
5709 return DRC->contains(Reg);
5710
5711 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5712
5713 if (MO.getSubReg()) {
5714 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5715 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5716 if (!SuperRC)
5717 return false;
5718
5719 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5720 if (!DRC)
5721 return false;
5722 }
5723 return RC->hasSuperClassEq(DRC);
5724}
5725
5727 const MCOperandInfo &OpInfo,
5728 const MachineOperand &MO) const {
5729 if (MO.isReg())
5730 return isLegalRegOperand(MRI, OpInfo, MO);
5731
5732 // Handle non-register types that are treated like immediates.
5733 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5734 return true;
5735}
5736
5737bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5738 const MachineOperand *MO) const {
5739 const MachineFunction &MF = *MI.getParent()->getParent();
5740 const MachineRegisterInfo &MRI = MF.getRegInfo();
5741 const MCInstrDesc &InstDesc = MI.getDesc();
5742 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5743 const TargetRegisterClass *DefinedRC =
5744 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5745 if (!MO)
5746 MO = &MI.getOperand(OpIdx);
5747
5748 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5749 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5750 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5751 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5752 return false;
5753
5754     SmallDenseSet<RegSubRegPair> SGPRsUsed;
5755     if (MO->isReg())
5756 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5757
5758 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5759 if (i == OpIdx)
5760 continue;
5761 const MachineOperand &Op = MI.getOperand(i);
5762 if (Op.isReg()) {
5763 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5764 if (!SGPRsUsed.count(SGPR) &&
5765 // FIXME: This can access off the end of the operands() array.
5766 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5767 if (--ConstantBusLimit <= 0)
5768 return false;
5769 SGPRsUsed.insert(SGPR);
5770 }
5771 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5772 !isInlineConstant(Op, InstDesc.operands()[i])) {
5773 if (!LiteralLimit--)
5774 return false;
5775 if (--ConstantBusLimit <= 0)
5776 return false;
5777 }
5778 }
5779 } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() &&
5780 isF16PseudoScalarTrans(MI.getOpcode()) &&
5781 isInlineConstant(*MO, OpInfo)) {
5782 return false;
5783 }
5784
5785 if (MO->isReg()) {
5786 if (!DefinedRC)
5787 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5788 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5789 return false;
5790 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5791 if (IsAGPR && !ST.hasMAIInsts())
5792 return false;
5793 unsigned Opc = MI.getOpcode();
5794 if (IsAGPR &&
5795 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5796 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5797 return false;
5798 // Atomics should have both vdst and vdata either vgpr or agpr.
5799 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5800 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5801 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5802 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5803 MI.getOperand(DataIdx).isReg() &&
5804 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5805 return false;
5806 if ((int)OpIdx == DataIdx) {
5807 if (VDstIdx != -1 &&
5808 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5809 return false;
5810 // DS instructions with 2 src operands also must have tied RC.
5811 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5812 AMDGPU::OpName::data1);
5813 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5814 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5815 return false;
5816 }
5817 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5818 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5819 RI.isSGPRReg(MRI, MO->getReg()))
5820 return false;
5821 return true;
5822 }
5823
5824 if (MO->isImm()) {
5825 uint64_t Imm = MO->getImm();
5826 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5827 bool Is64BitOp = Is64BitFPOp ||
5831 if (Is64BitOp &&
5833 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5834 return false;
5835
5836 // FIXME: We can use sign extended 64-bit literals, but only for signed
5837 // operands. At the moment we do not know if an operand is signed.
5838     // Such an operand will be encoded as its low 32 bits and then either
5839 // correctly sign extended or incorrectly zero extended by HW.
5840 if (!Is64BitFPOp && (int32_t)Imm < 0)
5841 return false;
5842 }
5843 }
5844
5845 // Handle non-register types that are treated like immediates.
5846 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5847
5848 if (!DefinedRC) {
5849 // This operand expects an immediate.
5850 return true;
5851 }
5852
5853 return isImmOperandLegal(MI, OpIdx, *MO);
5854}
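// An illustrative consequence of the constant-bus accounting above (register
// names are placeholders): on a pre-GFX10 target (constant bus limit 1, no
// VOP3 literal), an instruction that already reads one SGPR cannot take a
// second, distinct SGPR or a literal in another source operand, e.g.
//   %d = V_ADD_F32_e64 %sgpr0, %sgpr1   ; second SGPR rejected pre-GFX10
// so isOperandLegal() reports false and the caller must first move the
// offending value into a VGPR (copy or V_READFIRSTLANE_B32).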
5855
5857 MachineInstr &MI) const {
5858 unsigned Opc = MI.getOpcode();
5859 const MCInstrDesc &InstrDesc = get(Opc);
5860
5861 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5862 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5863
5864 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5865 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5866
5867 // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
5868 // we need to only have one constant bus use before GFX10.
5869 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5870 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5871 RI.isSGPRReg(MRI, Src0.getReg()))
5872 legalizeOpWithMove(MI, Src0Idx);
5873
5874 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5875 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5876 // src0/src1 with V_READFIRSTLANE.
5877 if (Opc == AMDGPU::V_WRITELANE_B32) {
5878 const DebugLoc &DL = MI.getDebugLoc();
5879 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5880 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5881 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5882 .add(Src0);
5883 Src0.ChangeToRegister(Reg, false);
5884 }
5885 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5886 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5887 const DebugLoc &DL = MI.getDebugLoc();
5888 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5889 .add(Src1);
5890 Src1.ChangeToRegister(Reg, false);
5891 }
5892 return;
5893 }
5894
5895 // No VOP2 instructions support AGPRs.
5896 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5897 legalizeOpWithMove(MI, Src0Idx);
5898
5899 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5900 legalizeOpWithMove(MI, Src1Idx);
5901
5902 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5903 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5904 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5905 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5906 legalizeOpWithMove(MI, Src2Idx);
5907 }
5908
5909   // VOP2 instructions accept all operand types for src0, so we don't need to
5910   // check its legality. If src1 is already legal, we don't need to do anything.
5911 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5912 return;
5913
5914 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5915 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5916 // select is uniform.
5917 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5918 RI.isVGPR(MRI, Src1.getReg())) {
5919 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5920 const DebugLoc &DL = MI.getDebugLoc();
5921 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5922 .add(Src1);
5923 Src1.ChangeToRegister(Reg, false);
5924 return;
5925 }
5926
5927   // We do not use commuteInstruction here because it is too aggressive and will
5928   // commute whenever possible. We only want to commute here if it improves
5929   // legality. This can be called a fairly large number of times, so don't waste
5930   // compile time pointlessly swapping and checking legality again.
5931 if (HasImplicitSGPR || !MI.isCommutable()) {
5932 legalizeOpWithMove(MI, Src1Idx);
5933 return;
5934 }
5935
5936 // If src0 can be used as src1, commuting will make the operands legal.
5937 // Otherwise we have to give up and insert a move.
5938 //
5939 // TODO: Other immediate-like operand kinds could be commuted if there was a
5940 // MachineOperand::ChangeTo* for them.
5941 if ((!Src1.isImm() && !Src1.isReg()) ||
5942 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5943 legalizeOpWithMove(MI, Src1Idx);
5944 return;
5945 }
5946
5947 int CommutedOpc = commuteOpcode(MI);
5948 if (CommutedOpc == -1) {
5949 legalizeOpWithMove(MI, Src1Idx);
5950 return;
5951 }
5952
5953 MI.setDesc(get(CommutedOpc));
5954
5955 Register Src0Reg = Src0.getReg();
5956 unsigned Src0SubReg = Src0.getSubReg();
5957 bool Src0Kill = Src0.isKill();
5958
5959 if (Src1.isImm())
5960 Src0.ChangeToImmediate(Src1.getImm());
5961 else if (Src1.isReg()) {
5962 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5963 Src0.setSubReg(Src1.getSubReg());
5964 } else
5965 llvm_unreachable("Should only have register or immediate operands");
5966
5967 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5968 Src1.setSubReg(Src0SubReg);
5970}
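// Illustrative effect of the commute path above (register names are
// placeholders): given
//   %d = V_SUB_F32_e32 %vgpr0, %sgpr4     ; src1 may not be an SGPR
// the operands are swapped using the commuted opcode
//   %d = V_SUBREV_F32_e32 %sgpr4, %vgpr0  ; src0 may read the constant bus
// instead of inserting an extra copy of %sgpr4 into a VGPR.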
5971
5972// Legalize VOP3 operands. All operand types are supported for any operand,
5973// but only one literal constant is allowed, and only starting from GFX10.
5975 MachineInstr &MI) const {
5976 unsigned Opc = MI.getOpcode();
5977
5978 int VOP3Idx[3] = {
5979 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5980 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5981 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5982 };
5983
5984 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5985 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5986 // src1 and src2 must be scalar
5987 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5988 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5989 const DebugLoc &DL = MI.getDebugLoc();
5990 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5991 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5992 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5993 .add(Src1);
5994 Src1.ChangeToRegister(Reg, false);
5995 }
5996 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5997 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5998 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5999 .add(Src2);
6000 Src2.ChangeToRegister(Reg, false);
6001 }
6002 }
6003
6004 // Find the one SGPR operand we are allowed to use.
6005 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6006 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6007 SmallDenseSet<unsigned> SGPRsUsed;
6008 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6009 if (SGPRReg) {
6010 SGPRsUsed.insert(SGPRReg);
6011 --ConstantBusLimit;
6012 }
6013
6014 for (int Idx : VOP3Idx) {
6015 if (Idx == -1)
6016 break;
6017 MachineOperand &MO = MI.getOperand(Idx);
6018
6019 if (!MO.isReg()) {
6020 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6021 continue;
6022
6023 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6024 --LiteralLimit;
6025 --ConstantBusLimit;
6026 continue;
6027 }
6028
6029 --LiteralLimit;
6030 --ConstantBusLimit;
6032 continue;
6033 }
6034
6035 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6036 !isOperandLegal(MI, Idx, &MO)) {
6038 continue;
6039 }
6040
6041 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6042 continue; // VGPRs are legal
6043
6044 // We can use one SGPR in each VOP3 instruction prior to GFX10
6045 // and two starting from GFX10.
6046 if (SGPRsUsed.count(MO.getReg()))
6047 continue;
6048 if (ConstantBusLimit > 0) {
6049 SGPRsUsed.insert(MO.getReg());
6050 --ConstantBusLimit;
6051 continue;
6052 }
6053
6054 // If we make it this far, then the operand is not legal and we must
6055 // legalize it.
6057 }
6058
6059 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6060 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6061 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6062 legalizeOpWithMove(MI, VOP3Idx[2]);
6063}
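// Illustrative example of the SGPR/literal budget enforced above (virtual
// register names are placeholders): on a pre-GFX10 target the constant bus
// limit is 1 and no VOP3 literal is allowed, so
//   %d = V_FMA_F32_e64 %sgpr0, %sgpr1, %vgpr2
// keeps %sgpr0 but forces %sgpr1 through legalizeOpWithMove() (a copy into a
// VGPR). On GFX10+ the limit is 2 and a literal is allowed, so the same
// instruction is already legal.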
6064
6066 MachineRegisterInfo &MRI) const {
6067 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6068 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6069 Register DstReg = MRI.createVirtualRegister(SRC);
6070 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6071
6072 if (RI.hasAGPRs(VRC)) {
6073 VRC = RI.getEquivalentVGPRClass(VRC);
6074 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6075 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6076 get(TargetOpcode::COPY), NewSrcReg)
6077 .addReg(SrcReg);
6078 SrcReg = NewSrcReg;
6079 }
6080
6081 if (SubRegs == 1) {
6082 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6083 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6084 .addReg(SrcReg);
6085 return DstReg;
6086 }
6087
6089 for (unsigned i = 0; i < SubRegs; ++i) {
6090 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6091 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6092 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6093 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6094 SRegs.push_back(SGPR);
6095 }
6096
6098 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6099 get(AMDGPU::REG_SEQUENCE), DstReg);
6100 for (unsigned i = 0; i < SubRegs; ++i) {
6101 MIB.addReg(SRegs[i]);
6102 MIB.addImm(RI.getSubRegFromChannel(i));
6103 }
6104 return DstReg;
6105}
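// Illustrative expansion for a 64-bit value held in a VGPR pair (virtual
// register names are placeholders):
//   %lo:sgpr_32  = V_READFIRSTLANE_B32 %vsrc.sub0
//   %hi:sgpr_32  = V_READFIRSTLANE_B32 %vsrc.sub1
//   %dst:sreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1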
6106
6108 MachineInstr &MI) const {
6109
6110   // If the pointer is stored in VGPRs, then we need to move it to
6111   // SGPRs using v_readfirstlane. This is safe because we only select
6112   // loads with uniform pointers to SMRD instructions, so we know the
6113   // pointer value is uniform.
6114 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6115 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6116 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6117 SBase->setReg(SGPR);
6118 }
6119 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6120 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6121 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6122 SOff->setReg(SGPR);
6123 }
6124}
6125
6127 unsigned Opc = Inst.getOpcode();
6128 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6129 if (OldSAddrIdx < 0)
6130 return false;
6131
6133
6134 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6135 if (NewOpc < 0)
6136     NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6137   if (NewOpc < 0)
6138 return false;
6139
6141 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6142 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6143 return false;
6144
6145 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6146 if (NewVAddrIdx < 0)
6147 return false;
6148
6149 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6150
6151 // Check vaddr, it shall be zero or absent.
6152 MachineInstr *VAddrDef = nullptr;
6153 if (OldVAddrIdx >= 0) {
6154 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6155 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6156 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6157 !VAddrDef->getOperand(1).isImm() ||
6158 VAddrDef->getOperand(1).getImm() != 0)
6159 return false;
6160 }
6161
6162 const MCInstrDesc &NewDesc = get(NewOpc);
6163 Inst.setDesc(NewDesc);
6164
6165 // Callers expect iterator to be valid after this call, so modify the
6166 // instruction in place.
6167 if (OldVAddrIdx == NewVAddrIdx) {
6168 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6169 // Clear use list from the old vaddr holding a zero register.
6170 MRI.removeRegOperandFromUseList(&NewVAddr);
6171 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6172 Inst.removeOperand(OldSAddrIdx);
6173 // Update the use list with the pointer we have just moved from vaddr to
6174 // saddr position. Otherwise new vaddr will be missing from the use list.
6175 MRI.removeRegOperandFromUseList(&NewVAddr);
6176 MRI.addRegOperandToUseList(&NewVAddr);
6177 } else {
6178 assert(OldSAddrIdx == NewVAddrIdx);
6179
6180 if (OldVAddrIdx >= 0) {
6181 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6182 AMDGPU::OpName::vdst_in);
6183
6184       // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6185 // it asserts. Untie the operands for now and retie them afterwards.
6186 if (NewVDstIn != -1) {
6187 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6188 Inst.untieRegOperand(OldVDstIn);
6189 }
6190
6191 Inst.removeOperand(OldVAddrIdx);
6192
6193 if (NewVDstIn != -1) {
6194 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6195 Inst.tieOperands(NewVDst, NewVDstIn);
6196 }
6197 }
6198 }
6199
6200 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6201 VAddrDef->eraseFromParent();
6202
6203 return true;
6204}
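// Illustrative transform performed above (operand lists abbreviated): a
// GLOBAL_*_SADDR form whose saddr has ended up in a VGPR, e.g.
//   %d = GLOBAL_LOAD_DWORD_SADDR %zero_vgpr, %ptr_vgpr, ...
// is rewritten in place to the plain vaddr form
//   %d = GLOBAL_LOAD_DWORD %ptr_vgpr, ...
// and the now-dead zero V_MOV_B32 feeding the old vaddr is erased.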
6205
6206// FIXME: Remove this when SelectionDAG is obsoleted.
6208 MachineInstr &MI) const {
6210 return;
6211
6212 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6213 // thinks they are uniform, so a readfirstlane should be valid.
6214 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6215 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6216 return;
6217
6219 return;
6220
6221 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6222 SAddr->setReg(ToSGPR);
6223}
6224
6227 const TargetRegisterClass *DstRC,
6230 const DebugLoc &DL) const {
6231 Register OpReg = Op.getReg();
6232 unsigned OpSubReg = Op.getSubReg();
6233
6234 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6235 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6236
6237 // Check if operand is already the correct register class.
6238 if (DstRC == OpRC)
6239 return;
6240
6241 Register DstReg = MRI.createVirtualRegister(DstRC);
6242 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6243
6244 Op.setReg(DstReg);
6245 Op.setSubReg(0);
6246
6247 MachineInstr *Def = MRI.getVRegDef(OpReg);
6248 if (!Def)
6249 return;
6250
6251 // Try to eliminate the copy if it is copying an immediate value.
6252 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6253 foldImmediate(*Copy, *Def, OpReg, &MRI);
6254
6255 bool ImpDef = Def->isImplicitDef();
6256 while (!ImpDef && Def && Def->isCopy()) {
6257 if (Def->getOperand(1).getReg().isPhysical())
6258 break;
6259 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6260 ImpDef = Def && Def->isImplicitDef();
6261 }
6262 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6263 !ImpDef)
6264 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6265}
6266
6267// Emit the actual waterfall loop, executing the wrapped instruction for each
6268// unique value of \p ScalarOps across all lanes. In the best case we execute
6269// one iteration; in the worst case we execute once per lane (up to 64).
6272 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6273 ArrayRef<MachineOperand *> ScalarOps) {
6274 MachineFunction &MF = *OrigBB.getParent();
6275 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6276 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6277 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6278 unsigned SaveExecOpc =
6279 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6280 unsigned XorTermOpc =
6281 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6282 unsigned AndOpc =
6283 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6284 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6285
6287
6288 SmallVector<Register, 8> ReadlanePieces;
6289 Register CondReg;
6290
6291 for (MachineOperand *ScalarOp : ScalarOps) {
6292 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6293 unsigned NumSubRegs = RegSize / 32;
6294 Register VScalarOp = ScalarOp->getReg();
6295
6296 if (NumSubRegs == 1) {
6297 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6298
6299 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6300 .addReg(VScalarOp);
6301
6302 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6303
6304 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6305 .addReg(CurReg)
6306 .addReg(VScalarOp);
6307
6308 // Combine the comparison results with AND.
6309 if (!CondReg) // First.
6310 CondReg = NewCondReg;
6311 else { // If not the first, we create an AND.
6312 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6313 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6314 .addReg(CondReg)
6315 .addReg(NewCondReg);
6316 CondReg = AndReg;
6317 }
6318
6319 // Update ScalarOp operand to use the SGPR ScalarOp.
6320 ScalarOp->setReg(CurReg);
6321 ScalarOp->setIsKill();
6322 } else {
6323 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6324 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6325 "Unhandled register size");
6326
6327 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6328 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6329 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6330
6331 // Read the next variant <- also loop target.
6332 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6333 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6334
6335 // Read the next variant <- also loop target.
6336 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6337 .addReg(VScalarOp, VScalarOpUndef,
6338 TRI->getSubRegFromChannel(Idx + 1));
6339
6340 ReadlanePieces.push_back(CurRegLo);
6341 ReadlanePieces.push_back(CurRegHi);
6342
6343 // Comparison is to be done as 64-bit.
6344 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6345 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6346 .addReg(CurRegLo)
6347 .addImm(AMDGPU::sub0)
6348 .addReg(CurRegHi)
6349 .addImm(AMDGPU::sub1);
6350
6351 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6352 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6353 NewCondReg)
6354 .addReg(CurReg);
6355 if (NumSubRegs <= 2)
6356 Cmp.addReg(VScalarOp);
6357 else
6358 Cmp.addReg(VScalarOp, VScalarOpUndef,
6359 TRI->getSubRegFromChannel(Idx, 2));
6360
6361 // Combine the comparison results with AND.
6362 if (!CondReg) // First.
6363 CondReg = NewCondReg;
6364 else { // If not the first, we create an AND.
6365 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6366 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6367 .addReg(CondReg)
6368 .addReg(NewCondReg);
6369 CondReg = AndReg;
6370 }
6371 } // End for loop.
6372
6373 auto SScalarOpRC =
6374 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6375 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6376
6377 // Build scalar ScalarOp.
6378 auto Merge =
6379 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6380 unsigned Channel = 0;
6381 for (Register Piece : ReadlanePieces) {
6382 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6383 }
6384
6385 // Update ScalarOp operand to use the SGPR ScalarOp.
6386 ScalarOp->setReg(SScalarOp);
6387 ScalarOp->setIsKill();
6388 }
6389 }
6390
6391 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6392 MRI.setSimpleHint(SaveExec, CondReg);
6393
6394 // Update EXEC to matching lanes, saving original to SaveExec.
6395 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6396 .addReg(CondReg, RegState::Kill);
6397
6398 // The original instruction is here; we insert the terminators after it.
6399 I = BodyBB.end();
6400
6401 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6402 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6403 .addReg(Exec)
6404 .addReg(SaveExec);
6405
6406 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6407}
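// For a single 32-bit ScalarOp on a wave64 target, the code above builds
// roughly the following MIR (illustrative; wave32 uses the _B32 variants):
//
//   LoopBB:
//     %s    = V_READFIRSTLANE_B32 %vscalar
//     %cond = V_CMP_EQ_U32_e64 %s, %vscalar
//     %save = S_AND_SAVEEXEC_B64 killed %cond
//   BodyBB:
//     ... the wrapped instruction, now reading %s ...
//     $exec = S_XOR_B64_term $exec, %save
//     SI_WATERFALL_LOOP %LoopBB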
6408
6409// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6410// with SGPRs by iterating over all unique values across all lanes.
6411// Returns the loop basic block that now contains \p MI.
6412static MachineBasicBlock *
6416 MachineBasicBlock::iterator Begin = nullptr,
6417 MachineBasicBlock::iterator End = nullptr) {
6418 MachineBasicBlock &MBB = *MI.getParent();
6419 MachineFunction &MF = *MBB.getParent();
6420 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6421 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6423 if (!Begin.isValid())
6424 Begin = &MI;
6425 if (!End.isValid()) {
6426 End = &MI;
6427 ++End;
6428 }
6429 const DebugLoc &DL = MI.getDebugLoc();
6430 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6431 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6432 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6433
6434 // Save SCC. Waterfall Loop may overwrite SCC.
6435 Register SaveSCCReg;
6436
6437 // FIXME: We should maintain SCC liveness while doing the FixSGPRCopies walk
6438   // rather than doing an unlimited scan everywhere.
6439 bool SCCNotDead =
6440 MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI,
6441 std::numeric_limits<unsigned>::max()) !=
6443 if (SCCNotDead) {
6444 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6445 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6446 .addImm(1)
6447 .addImm(0);
6448 }
6449
6450 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6451
6452 // Save the EXEC mask
6453 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6454
6455 // Killed uses in the instruction we are waterfalling around will be
6456 // incorrect due to the added control-flow.
6458 ++AfterMI;
6459 for (auto I = Begin; I != AfterMI; I++) {
6460 for (auto &MO : I->all_uses())
6461 MRI.clearKillFlags(MO.getReg());
6462 }
6463
6464 // To insert the loop we need to split the block. Move everything after this
6465 // point to a new block, and insert a new empty block between the two.
6468 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6470 ++MBBI;
6471
6472 MF.insert(MBBI, LoopBB);
6473 MF.insert(MBBI, BodyBB);
6474 MF.insert(MBBI, RemainderBB);
6475
6476 LoopBB->addSuccessor(BodyBB);
6477 BodyBB->addSuccessor(LoopBB);
6478 BodyBB->addSuccessor(RemainderBB);
6479
6480   // Move the instructions from Begin up to (but not including) End into
6481   // BodyBB, and the remainder of the block to RemainderBB.
6482 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6483 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6484 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6485
6486 MBB.addSuccessor(LoopBB);
6487
6488 // Update dominators. We know that MBB immediately dominates LoopBB, that
6489 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6490 // RemainderBB. RemainderBB immediately dominates all of the successors
6491 // transferred to it from MBB that MBB used to properly dominate.
6492 if (MDT) {
6493 MDT->addNewBlock(LoopBB, &MBB);
6494 MDT->addNewBlock(BodyBB, LoopBB);
6495 MDT->addNewBlock(RemainderBB, BodyBB);
6496 for (auto &Succ : RemainderBB->successors()) {
6497 if (MDT->properlyDominates(&MBB, Succ)) {
6498 MDT->changeImmediateDominator(Succ, RemainderBB);
6499 }
6500 }
6501 }
6502
6503 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6504
6505 MachineBasicBlock::iterator First = RemainderBB->begin();
6506 // Restore SCC
6507 if (SCCNotDead) {
6508 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6509 .addReg(SaveSCCReg, RegState::Kill)
6510 .addImm(0);
6511 }
6512
6513 // Restore the EXEC mask
6514 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6515 return BodyBB;
6516}
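// Resulting control flow (illustrative): the original block falls through to
// the loop, the body either repeats the loop for the remaining lanes or exits
// to the remainder block, which restores SCC and EXEC:
//
//   MBB --> LoopBB --> BodyBB --> RemainderBB --> original successors
//              ^          |
//              +----------+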
6517
6518// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6519static std::tuple<unsigned, unsigned>
6521 MachineBasicBlock &MBB = *MI.getParent();
6522 MachineFunction &MF = *MBB.getParent();
6524
6525 // Extract the ptr from the resource descriptor.
6526 unsigned RsrcPtr =
6527 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6528 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6529
6530 // Create an empty resource descriptor
6531 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6532 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6533 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6534 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6535 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6536
6537 // Zero64 = 0
6538 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6539 .addImm(0);
6540
6541 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6542 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6543 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6544
6545 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6546 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6547 .addImm(RsrcDataFormat >> 32);
6548
6549 // NewSRsrc = {Zero64, SRsrcFormat}
6550 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6551 .addReg(Zero64)
6552 .addImm(AMDGPU::sub0_sub1)
6553 .addReg(SRsrcFormatLo)
6554 .addImm(AMDGPU::sub2)
6555 .addReg(SRsrcFormatHi)
6556 .addImm(AMDGPU::sub3);
6557
6558 return std::tuple(RsrcPtr, NewSRsrc);
6559}
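// Conceptually, the replacement descriptor built above is
//   NewSRsrc = { base = 0 (sub0_sub1), RSRC_DATA_FORMAT[31:0] (sub2),
//                RSRC_DATA_FORMAT[63:32] (sub3) }
// while the real 64-bit base pointer is returned separately in RsrcPtr so the
// caller can fold it into the address computation instead.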
6560
6563 MachineDominatorTree *MDT) const {
6564 MachineFunction &MF = *MI.getParent()->getParent();
6566 MachineBasicBlock *CreatedBB = nullptr;
6567
6568 // Legalize VOP2
6569 if (isVOP2(MI) || isVOPC(MI)) {
6571 return CreatedBB;
6572 }
6573
6574 // Legalize VOP3
6575 if (isVOP3(MI)) {
6577 return CreatedBB;
6578 }
6579
6580 // Legalize SMRD
6581 if (isSMRD(MI)) {
6583 return CreatedBB;
6584 }
6585
6586 // Legalize FLAT
6587 if (isFLAT(MI)) {
6589 return CreatedBB;
6590 }
6591
6592 // Legalize REG_SEQUENCE and PHI
6593   // The register class of the operands must be the same type as the register
6594 // class of the output.
6595 if (MI.getOpcode() == AMDGPU::PHI) {
6596 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6597 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6598 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6599 continue;
6600 const TargetRegisterClass *OpRC =
6601 MRI.getRegClass(MI.getOperand(i).getReg());
6602 if (RI.hasVectorRegisters(OpRC)) {
6603 VRC = OpRC;
6604 } else {
6605 SRC = OpRC;
6606 }
6607 }
6608
6609     // If any of the operands are VGPR registers, then they all must be VGPRs;
6610     // otherwise we will create illegal VGPR->SGPR copies when legalizing
6611     // them.
6612 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6613 if (!VRC) {
6614 assert(SRC);
6615 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6616 VRC = &AMDGPU::VReg_1RegClass;
6617 } else
6618 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6619 ? RI.getEquivalentAGPRClass(SRC)
6620 : RI.getEquivalentVGPRClass(SRC);
6621 } else {
6622 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6623 ? RI.getEquivalentAGPRClass(VRC)
6624 : RI.getEquivalentVGPRClass(VRC);
6625 }
6626 RC = VRC;
6627 } else {
6628 RC = SRC;
6629 }
6630
6631 // Update all the operands so they have the same type.
6632 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6633 MachineOperand &Op = MI.getOperand(I);
6634 if (!Op.isReg() || !Op.getReg().isVirtual())
6635 continue;
6636
6637 // MI is a PHI instruction.
6638 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6639       MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6640
6641 // Avoid creating no-op copies with the same src and dst reg class. These
6642 // confuse some of the machine passes.
6643 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6644 }
6645 }
6646
6647 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6648 // VGPR dest type and SGPR sources, insert copies so all operands are
6649 // VGPRs. This seems to help operand folding / the register coalescer.
6650 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6651 MachineBasicBlock *MBB = MI.getParent();
6652 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6653 if (RI.hasVGPRs(DstRC)) {
6654 // Update all the operands so they are VGPR register classes. These may
6655 // not be the same register class because REG_SEQUENCE supports mixing
6656 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6657 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6658 MachineOperand &Op = MI.getOperand(I);
6659 if (!Op.isReg() || !Op.getReg().isVirtual())
6660 continue;
6661
6662 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6663 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6664 if (VRC == OpRC)
6665 continue;
6666
6667 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6668 Op.setIsKill();
6669 }
6670 }
6671
6672 return CreatedBB;
6673 }
6674
6675 // Legalize INSERT_SUBREG
6676 // src0 must have the same register class as dst
6677 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6678 Register Dst = MI.getOperand(0).getReg();
6679 Register Src0 = MI.getOperand(1).getReg();
6680 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6681 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6682 if (DstRC != Src0RC) {
6683 MachineBasicBlock *MBB = MI.getParent();
6684 MachineOperand &Op = MI.getOperand(1);
6685 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6686 }
6687 return CreatedBB;
6688 }
6689
6690 // Legalize SI_INIT_M0
6691 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6692 MachineOperand &Src = MI.getOperand(0);
6693 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6694 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6695 return CreatedBB;
6696 }
6697
6698 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6699 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6700 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6701 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6702 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6703 MI.getOpcode() == AMDGPU::S_WQM_B64 ||
6704 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32 ||
6705 MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U64) {
6706 MachineOperand &Src = MI.getOperand(1);
6707 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6708 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6709 return CreatedBB;
6710 }
6711
6712 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6713 //
6714 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6715 // scratch memory access. In both cases, the legalization never involves
6716 // conversion to the addr64 form.
6718 (isMUBUF(MI) || isMTBUF(MI)))) {
6719 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6720 : AMDGPU::OpName::srsrc;
6721 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6722 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6723 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6724
6725 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6726 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6727 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6728 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6729
6730 return CreatedBB;
6731 }
6732
6733 // Legalize SI_CALL
6734 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6735 MachineOperand *Dest = &MI.getOperand(0);
6736 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6737       // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, plus the
6738       // following copies, into the loop block; copies from and to physical
6739       // registers also need to be moved there.
6740 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6741 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6742
6743 // Also move the copies to physical registers into the loop block
6744 MachineBasicBlock &MBB = *MI.getParent();
6746 while (Start->getOpcode() != FrameSetupOpcode)
6747 --Start;
6749 while (End->getOpcode() != FrameDestroyOpcode)
6750 ++End;
6751 // Also include following copies of the return value
6752 ++End;
6753 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6754 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6755 ++End;
6756 CreatedBB =
6757 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6758 }
6759 }
6760
6761 // Legalize s_sleep_var.
6762 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6763 const DebugLoc &DL = MI.getDebugLoc();
6764 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6765 int Src0Idx =
6766 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6767 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6768 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6769 .add(Src0);
6770 Src0.ChangeToRegister(Reg, false);
6771 return nullptr;
6772 }
6773
6774 // Legalize MUBUF instructions.
6775 bool isSoffsetLegal = true;
6776 int SoffsetIdx =
6777 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6778 if (SoffsetIdx != -1) {
6779 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6780 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6781 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6782 isSoffsetLegal = false;
6783 }
6784 }
6785
6786 bool isRsrcLegal = true;
6787 int RsrcIdx =
6788 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6789 if (RsrcIdx != -1) {
6790 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6791 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6792 isRsrcLegal = false;
6793 }
6794 }
6795
6796 // The operands are legal.
6797 if (isRsrcLegal && isSoffsetLegal)
6798 return CreatedBB;
6799
6800 if (!isRsrcLegal) {
6801 // Legalize a VGPR Rsrc
6802 //
6803 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6804 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6805 // a zero-value SRsrc.
6806 //
6807 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6808 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6809 // above.
6810 //
6811 // Otherwise we are on non-ADDR64 hardware, and/or we have
6812 // idxen/offen/bothen and we fall back to a waterfall loop.
6813
6814 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6815 MachineBasicBlock &MBB = *MI.getParent();
6816
6817 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6818 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6819 // This is already an ADDR64 instruction so we need to add the pointer
6820 // extracted from the resource descriptor to the current value of VAddr.
6821 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6822 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6823 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6824
6825 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6826 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6827 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6828
6829 unsigned RsrcPtr, NewSRsrc;
6830 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6831
6832 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6833 const DebugLoc &DL = MI.getDebugLoc();
6834 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6835 .addDef(CondReg0)
6836 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6837 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6838 .addImm(0);
6839
6840 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6841 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6842 .addDef(CondReg1, RegState::Dead)
6843 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6844 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6845 .addReg(CondReg0, RegState::Kill)
6846 .addImm(0);
6847
6848 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6849 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6850 .addReg(NewVAddrLo)
6851 .addImm(AMDGPU::sub0)
6852 .addReg(NewVAddrHi)
6853 .addImm(AMDGPU::sub1);
6854
6855 VAddr->setReg(NewVAddr);
6856 Rsrc->setReg(NewSRsrc);
6857 } else if (!VAddr && ST.hasAddr64()) {
6858 // This instruction is the _OFFSET variant, so we need to convert it to
6859 // ADDR64.
6860 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6861 "FIXME: Need to emit flat atomics here");
6862
6863 unsigned RsrcPtr, NewSRsrc;
6864 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6865
6866 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6867 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6868 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6869 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6870 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6871
6872 // Atomics with return have an additional tied operand and are
6873 // missing some of the special bits.
6874 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6875 MachineInstr *Addr64;
6876
6877 if (!VDataIn) {
6878 // Regular buffer load / store.
6879 MachineInstrBuilder MIB =
6880 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6881 .add(*VData)
6882 .addReg(NewVAddr)
6883 .addReg(NewSRsrc)
6884 .add(*SOffset)
6885 .add(*Offset);
6886
6887 if (const MachineOperand *CPol =
6888 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6889 MIB.addImm(CPol->getImm());
6890 }
6891
6892 if (const MachineOperand *TFE =
6893 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6894 MIB.addImm(TFE->getImm());
6895 }
6896
6897 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6898
6899 MIB.cloneMemRefs(MI);
6900 Addr64 = MIB;
6901 } else {
6902 // Atomics with return.
6903 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6904 .add(*VData)
6905 .add(*VDataIn)
6906 .addReg(NewVAddr)
6907 .addReg(NewSRsrc)
6908 .add(*SOffset)
6909 .add(*Offset)
6910 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6911 .cloneMemRefs(MI);
6912 }
6913
6914 MI.removeFromParent();
6915
6916 // NewVaddr = {RsrcPtr:sub1, RsrcPtr:sub0}, i.e. the base pointer extracted from the Rsrc
6917 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6918 NewVAddr)
6919 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6920 .addImm(AMDGPU::sub0)
6921 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6922 .addImm(AMDGPU::sub1);
6923 } else {
6924 // Legalize a VGPR Rsrc and soffset together.
6925 if (!isSoffsetLegal) {
6926 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6927 CreatedBB =
6928 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6929 return CreatedBB;
6930 }
6931 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6932 return CreatedBB;
6933 }
6934 }
6935
6936 // Legalize a VGPR soffset.
6937 if (!isSoffsetLegal) {
6938 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6939 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6940 return CreatedBB;
6941 }
6942 return CreatedBB;
6943}
6944
6945 void SIInstrWorklist::insert(MachineInstr *MI) {
6946 InstrList.insert(MI);
6947 // Add MBUF instructions to the deferred list.
6948 int RsrcIdx =
6949 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6950 if (RsrcIdx != -1) {
6951 DeferredList.insert(MI);
6952 }
6953}
6954
6955 bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
6956 return DeferredList.contains(MI);
6957}
6958
6959 void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
6960 MachineDominatorTree *MDT) const {
6961
6962 while (!Worklist.empty()) {
6963 MachineInstr &Inst = *Worklist.top();
6964 Worklist.erase_top();
6965 // Skip MachineInstrs that are in the deferred list.
6966 if (Worklist.isDeferred(&Inst))
6967 continue;
6968 moveToVALUImpl(Worklist, MDT, Inst);
6969 }
6970
6971 // The deferred list of instructions will be processed once
6972 // all the MachineInstrs in the worklist are done.
6973 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6974 moveToVALUImpl(Worklist, MDT, *Inst);
6975 assert(Worklist.empty() &&
6976 "Deferred MachineInstr are not supposed to re-populate worklist");
6977 }
6978}
6979
6980 void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
6981 MachineDominatorTree *MDT,
6982 MachineInstr &Inst) const {
6983
6984 MachineBasicBlock *MBB = Inst.getParent();
6985 if (!MBB)
6986 return;
6987 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
6988 unsigned Opcode = Inst.getOpcode();
6989 unsigned NewOpcode = getVALUOp(Inst);
6990 // Handle some special cases
6991 switch (Opcode) {
6992 default:
6993 break;
6994 case AMDGPU::S_ADD_U64_PSEUDO:
6995 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6996 break;
6997 case AMDGPU::S_SUB_U64_PSEUDO:
6998 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6999 break;
7000 case AMDGPU::S_ADD_I32:
7001 case AMDGPU::S_SUB_I32: {
7002 // FIXME: The u32 versions currently selected use the carry.
7003 bool Changed;
7004 MachineBasicBlock *CreatedBBTmp = nullptr;
7005 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
7006 if (Changed)
7007 return;
7008
7009 // Default handling
7010 break;
7011 }
7012
7013 case AMDGPU::S_MUL_U64:
7014 // Split s_mul_u64 into 32-bit vector multiplications.
7015 splitScalarSMulU64(Worklist, Inst, MDT);
7016 Inst.eraseFromParent();
7017 return;
7018
7019 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7020 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7021 // This is a special case of s_mul_u64 where all the operands are either
7022 // zero extended or sign extended.
7023 splitScalarSMulPseudo(Worklist, Inst, MDT);
7024 Inst.eraseFromParent();
7025 return;
7026
7027 case AMDGPU::S_AND_B64:
7028 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7029 Inst.eraseFromParent();
7030 return;
7031
7032 case AMDGPU::S_OR_B64:
7033 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7034 Inst.eraseFromParent();
7035 return;
7036
7037 case AMDGPU::S_XOR_B64:
7038 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7039 Inst.eraseFromParent();
7040 return;
7041
7042 case AMDGPU::S_NAND_B64:
7043 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7044 Inst.eraseFromParent();
7045 return;
7046
7047 case AMDGPU::S_NOR_B64:
7048 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7049 Inst.eraseFromParent();
7050 return;
7051
7052 case AMDGPU::S_XNOR_B64:
7053 if (ST.hasDLInsts())
7054 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7055 else
7056 splitScalar64BitXnor(Worklist, Inst, MDT);
7057 Inst.eraseFromParent();
7058 return;
7059
7060 case AMDGPU::S_ANDN2_B64:
7061 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7062 Inst.eraseFromParent();
7063 return;
7064
7065 case AMDGPU::S_ORN2_B64:
7066 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7067 Inst.eraseFromParent();
7068 return;
7069
7070 case AMDGPU::S_BREV_B64:
7071 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7072 Inst.eraseFromParent();
7073 return;
7074
7075 case AMDGPU::S_NOT_B64:
7076 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7077 Inst.eraseFromParent();
7078 return;
7079
7080 case AMDGPU::S_BCNT1_I32_B64:
7081 splitScalar64BitBCNT(Worklist, Inst);
7082 Inst.eraseFromParent();
7083 return;
7084
7085 case AMDGPU::S_BFE_I64:
7086 splitScalar64BitBFE(Worklist, Inst);
7087 Inst.eraseFromParent();
7088 return;
7089
7090 case AMDGPU::S_FLBIT_I32_B64:
7091 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7092 Inst.eraseFromParent();
7093 return;
7094 case AMDGPU::S_FF1_I32_B64:
7095 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7096 Inst.eraseFromParent();
7097 return;
7098
7099 case AMDGPU::S_LSHL_B32:
7100 if (ST.hasOnlyRevVALUShifts()) {
7101 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7102 swapOperands(Inst);
7103 }
7104 break;
7105 case AMDGPU::S_ASHR_I32:
7106 if (ST.hasOnlyRevVALUShifts()) {
7107 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7108 swapOperands(Inst);
7109 }
7110 break;
7111 case AMDGPU::S_LSHR_B32:
7112 if (ST.hasOnlyRevVALUShifts()) {
7113 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7114 swapOperands(Inst);
7115 }
7116 break;
7117 case AMDGPU::S_LSHL_B64:
7118 if (ST.hasOnlyRevVALUShifts()) {
7119 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7120 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7121 : AMDGPU::V_LSHLREV_B64_e64;
7122 swapOperands(Inst);
7123 }
7124 break;
7125 case AMDGPU::S_ASHR_I64:
7126 if (ST.hasOnlyRevVALUShifts()) {
7127 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7128 swapOperands(Inst);
7129 }
7130 break;
7131 case AMDGPU::S_LSHR_B64:
7132 if (ST.hasOnlyRevVALUShifts()) {
7133 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7134 swapOperands(Inst);
7135 }
7136 break;
7137
7138 case AMDGPU::S_ABS_I32:
7139 lowerScalarAbs(Worklist, Inst);
7140 Inst.eraseFromParent();
7141 return;
7142
7143 case AMDGPU::S_CBRANCH_SCC0:
7144 case AMDGPU::S_CBRANCH_SCC1: {
7145 // Clear unused bits of vcc
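// ANDing the condition with EXEC clears the bits of inactive lanes, so stale
// bits cannot affect the VCC-based branch this instruction is converted to.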
7146 Register CondReg = Inst.getOperand(1).getReg();
7147 bool IsSCC = CondReg == AMDGPU::SCC;
7148 Register VCC = RI.getVCC();
7149 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7150 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7151 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7152 .addReg(EXEC)
7153 .addReg(IsSCC ? VCC : CondReg);
7154 Inst.removeOperand(1);
7155 } break;
7156
7157 case AMDGPU::S_BFE_U64:
7158 case AMDGPU::S_BFM_B64:
7159 llvm_unreachable("Moving this op to VALU not implemented");
7160
7161 case AMDGPU::S_PACK_LL_B32_B16:
7162 case AMDGPU::S_PACK_LH_B32_B16:
7163 case AMDGPU::S_PACK_HL_B32_B16:
7164 case AMDGPU::S_PACK_HH_B32_B16:
7165 movePackToVALU(Worklist, MRI, Inst);
7166 Inst.eraseFromParent();
7167 return;
7168
7169 case AMDGPU::S_XNOR_B32:
7170 lowerScalarXnor(Worklist, Inst);
7171 Inst.eraseFromParent();
7172 return;
7173
7174 case AMDGPU::S_NAND_B32:
7175 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7176 Inst.eraseFromParent();
7177 return;
7178
7179 case AMDGPU::S_NOR_B32:
7180 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7181 Inst.eraseFromParent();
7182 return;
7183
7184 case AMDGPU::S_ANDN2_B32:
7185 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7186 Inst.eraseFromParent();
7187 return;
7188
7189 case AMDGPU::S_ORN2_B32:
7190 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7191 Inst.eraseFromParent();
7192 return;
7193
7194 // TODO: remove as soon as everything is ready
7195 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7196 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7197 // can only be selected from the uniform SDNode.
7198 case AMDGPU::S_ADD_CO_PSEUDO:
7199 case AMDGPU::S_SUB_CO_PSEUDO: {
7200 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7201 ? AMDGPU::V_ADDC_U32_e64
7202 : AMDGPU::V_SUBB_U32_e64;
7203 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7204
7205 Register CarryInReg = Inst.getOperand(4).getReg();
7206 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7207 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7208 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7209 .addReg(CarryInReg);
7210 }
7211
7212 Register CarryOutReg = Inst.getOperand(1).getReg();
7213
7214 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7215 MRI.getRegClass(Inst.getOperand(0).getReg())));
7216 MachineInstr *CarryOp =
7217 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7218 .addReg(CarryOutReg, RegState::Define)
7219 .add(Inst.getOperand(2))
7220 .add(Inst.getOperand(3))
7221 .addReg(CarryInReg)
7222 .addImm(0);
7223 legalizeOperands(*CarryOp);
7224 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7225 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7226 Inst.eraseFromParent();
7227 }
7228 return;
7229 case AMDGPU::S_UADDO_PSEUDO:
7230 case AMDGPU::S_USUBO_PSEUDO: {
7231 const DebugLoc &DL = Inst.getDebugLoc();
7232 MachineOperand &Dest0 = Inst.getOperand(0);
7233 MachineOperand &Dest1 = Inst.getOperand(1);
7234 MachineOperand &Src0 = Inst.getOperand(2);
7235 MachineOperand &Src1 = Inst.getOperand(3);
7236
7237 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7238 ? AMDGPU::V_ADD_CO_U32_e64
7239 : AMDGPU::V_SUB_CO_U32_e64;
7240 const TargetRegisterClass *NewRC =
7241 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7242 Register DestReg = MRI.createVirtualRegister(NewRC);
7243 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7244 .addReg(Dest1.getReg(), RegState::Define)
7245 .add(Src0)
7246 .add(Src1)
7247 .addImm(0); // clamp bit
7248
7249 legalizeOperands(*NewInstr, MDT);
7250 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7251 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7252 Worklist);
7253 Inst.eraseFromParent();
7254 }
7255 return;
7256
7257 case AMDGPU::S_CSELECT_B32:
7258 case AMDGPU::S_CSELECT_B64:
7259 lowerSelect(Worklist, Inst, MDT);
7260 Inst.eraseFromParent();
7261 return;
7262 case AMDGPU::S_CMP_EQ_I32:
7263 case AMDGPU::S_CMP_LG_I32:
7264 case AMDGPU::S_CMP_GT_I32:
7265 case AMDGPU::S_CMP_GE_I32:
7266 case AMDGPU::S_CMP_LT_I32:
7267 case AMDGPU::S_CMP_LE_I32:
7268 case AMDGPU::S_CMP_EQ_U32:
7269 case AMDGPU::S_CMP_LG_U32:
7270 case AMDGPU::S_CMP_GT_U32:
7271 case AMDGPU::S_CMP_GE_U32:
7272 case AMDGPU::S_CMP_LT_U32:
7273 case AMDGPU::S_CMP_LE_U32:
7274 case AMDGPU::S_CMP_EQ_U64:
7275 case AMDGPU::S_CMP_LG_U64:
7276 case AMDGPU::S_CMP_LT_F32:
7277 case AMDGPU::S_CMP_EQ_F32:
7278 case AMDGPU::S_CMP_LE_F32:
7279 case AMDGPU::S_CMP_GT_F32:
7280 case AMDGPU::S_CMP_LG_F32:
7281 case AMDGPU::S_CMP_GE_F32:
7282 case AMDGPU::S_CMP_O_F32:
7283 case AMDGPU::S_CMP_U_F32:
7284 case AMDGPU::S_CMP_NGE_F32:
7285 case AMDGPU::S_CMP_NLG_F32:
7286 case AMDGPU::S_CMP_NGT_F32:
7287 case AMDGPU::S_CMP_NLE_F32:
7288 case AMDGPU::S_CMP_NEQ_F32:
7289 case AMDGPU::S_CMP_NLT_F32:
7290 case AMDGPU::S_CMP_LT_F16:
7291 case AMDGPU::S_CMP_EQ_F16:
7292 case AMDGPU::S_CMP_LE_F16:
7293 case AMDGPU::S_CMP_GT_F16:
7294 case AMDGPU::S_CMP_LG_F16:
7295 case AMDGPU::S_CMP_GE_F16:
7296 case AMDGPU::S_CMP_O_F16:
7297 case AMDGPU::S_CMP_U_F16:
7298 case AMDGPU::S_CMP_NGE_F16:
7299 case AMDGPU::S_CMP_NLG_F16:
7300 case AMDGPU::S_CMP_NGT_F16:
7301 case AMDGPU::S_CMP_NLE_F16:
7302 case AMDGPU::S_CMP_NEQ_F16:
7303 case AMDGPU::S_CMP_NLT_F16: {
7304 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7305 auto NewInstr =
7306 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7307 .setMIFlags(Inst.getFlags());
7308 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7309 AMDGPU::OpName::src0_modifiers) >= 0) {
7310 NewInstr
7311 .addImm(0) // src0_modifiers
7312 .add(Inst.getOperand(0)) // src0
7313 .addImm(0) // src1_modifiers
7314 .add(Inst.getOperand(1)) // src1
7315 .addImm(0); // clamp
7316 } else {
7317 NewInstr
7318 .add(Inst.getOperand(0))
7319 .add(Inst.getOperand(1));
7320 }
7321 legalizeOperands(*NewInstr, MDT);
7322 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7323 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7324 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7325 Inst.eraseFromParent();
7326 return;
7327 }
7328 case AMDGPU::S_CVT_HI_F32_F16: {
7329 const DebugLoc &DL = Inst.getDebugLoc();
7330 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7331 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7332 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7333 .addImm(16)
7334 .add(Inst.getOperand(1));
7335 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7336 .addImm(0) // src0_modifiers
7337 .addReg(TmpReg)
7338 .addImm(0) // clamp
7339 .addImm(0); // omod
7340
7341 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7342 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7343 Inst.eraseFromParent();
7344 return;
7345 }
7346 case AMDGPU::S_MINIMUM_F32:
7347 case AMDGPU::S_MAXIMUM_F32:
7348 case AMDGPU::S_MINIMUM_F16:
7349 case AMDGPU::S_MAXIMUM_F16: {
7350 const DebugLoc &DL = Inst.getDebugLoc();
7351 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7352 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7353 .addImm(0) // src0_modifiers
7354 .add(Inst.getOperand(1))
7355 .addImm(0) // src1_modifiers
7356 .add(Inst.getOperand(2))
7357 .addImm(0) // clamp
7358 .addImm(0); // omod
7359 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7360
7361 legalizeOperands(*NewInstr, MDT);
7362 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7363 Inst.eraseFromParent();
7364 return;
7365 }
7366 }
7367
7368 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7369 // We cannot move this instruction to the VALU, so we should try to
7370 // legalize its operands instead.
7371 legalizeOperands(Inst, MDT);
7372 return;
7373 }
7374 // Handle converting generic instructions like COPY-to-SGPR into
7375 // COPY-to-VGPR.
7376 if (NewOpcode == Opcode) {
7377 Register DstReg = Inst.getOperand(0).getReg();
7378 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7379
7380 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7381 // hope for the best.
7382 if (Inst.isCopy() && DstReg.isPhysical() &&
7383 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7384 // TODO: Only works for 32 bit registers.
7385 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7386 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7387 .add(Inst.getOperand(1));
7388 Inst.eraseFromParent();
7389 return;
7390 }
7391
7392 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7393 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7394 // Instead of creating a copy where src and dst are the same register
7395 // class, we just replace all uses of dst with src. These kinds of
7396 // copies interfere with the heuristics MachineSink uses to decide
7397 // whether or not to split a critical edge, since the pass assumes
7398 // that copies will end up as machine instructions and not be
7399 // eliminated.
7400 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7401 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7402 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7403 Inst.getOperand(0).setReg(DstReg);
7404 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7405 // these are deleted later, but at -O0 it would leave a suspicious
7406 // looking illegal copy of an undef register.
7407 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7408 Inst.removeOperand(I);
7409 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7410 return;
7411 }
7412 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7413 MRI.replaceRegWith(DstReg, NewDstReg);
7414 legalizeOperands(Inst, MDT);
7415 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7416 return;
7417 }
7418
7419 // Use the new VALU Opcode.
7420 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7421 .setMIFlags(Inst.getFlags());
7422 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7423 // Intersperse VOP3 modifiers among the SALU operands.
7424 NewInstr->addOperand(Inst.getOperand(0));
7425 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7426 AMDGPU::OpName::src0_modifiers) >= 0)
7427 NewInstr.addImm(0);
7428 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7429 MachineOperand Src = Inst.getOperand(1);
7430 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7431 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7432 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7433 else
7434 NewInstr->addOperand(Src);
7435 }
7436
7437 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7438 // We are converting these to a BFE, so we need to add the missing
7439 // operands for the size and offset.
7440 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7441 NewInstr.addImm(0);
7442 NewInstr.addImm(Size);
7443 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7444 // The VALU version adds the second operand to the result, so insert an
7445 // extra 0 operand.
7446 NewInstr.addImm(0);
7447 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7448 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7449 // If we need to move this to VGPRs, we need to unpack the second
7450 // operand back into the 2 separate ones for bit offset and width.
7451 assert(OffsetWidthOp.isImm() &&
7452 "Scalar BFE is only implemented for constant width and offset");
7453 uint32_t Imm = OffsetWidthOp.getImm();
7454
7455 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7456 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
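// e.g. an S_BFE immediate of 0x00100008 unpacks to Offset = 8, BitWidth = 16.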
7457 NewInstr.addImm(Offset);
7458 NewInstr.addImm(BitWidth);
7459 } else {
7460 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7461 AMDGPU::OpName::src1_modifiers) >= 0)
7462 NewInstr.addImm(0);
7463 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7464 NewInstr->addOperand(Inst.getOperand(2));
7465 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7466 AMDGPU::OpName::src2_modifiers) >= 0)
7467 NewInstr.addImm(0);
7468 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7469 NewInstr->addOperand(Inst.getOperand(3));
7470 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7471 NewInstr.addImm(0);
7472 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7473 NewInstr.addImm(0);
7474 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7475 NewInstr.addImm(0);
7476 }
7477 } else {
7478 // Just copy the SALU operands.
7479 for (const MachineOperand &Op : Inst.explicit_operands())
7480 NewInstr->addOperand(Op);
7481 }
7482
7483 // Remove any references to SCC. Vector instructions can't read from it, and
7484 // we're just about to add the implicit use / defs of VCC, so we don't want
7485 // both.
7486 for (MachineOperand &Op : Inst.implicit_operands()) {
7487 if (Op.getReg() == AMDGPU::SCC) {
7488 // Only propagate through live-def of SCC.
7489 if (Op.isDef() && !Op.isDead())
7490 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7491 if (Op.isUse())
7492 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7493 }
7494 }
7495 Inst.eraseFromParent();
7496 Register NewDstReg;
7497 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7498 Register DstReg = NewInstr->getOperand(0).getReg();
7499 assert(DstReg.isVirtual());
7500 // Update the destination register class.
7501 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7502 assert(NewDstRC);
7503 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7504 MRI.replaceRegWith(DstReg, NewDstReg);
7505 }
7506 fixImplicitOperands(*NewInstr);
7507 // Legalize the operands
7508 legalizeOperands(*NewInstr, MDT);
7509 if (NewDstReg)
7510 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7511}
7512
7513// Add/sub require special handling to deal with carry outs.
7514std::pair<bool, MachineBasicBlock *>
7515SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7516 MachineDominatorTree *MDT) const {
7517 if (ST.hasAddNoCarry()) {
7518 // Assume there is no user of scc since we don't select this in that case.
7519 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7520 // is used.
7521
7522 MachineBasicBlock &MBB = *Inst.getParent();
7523 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7524
7525 Register OldDstReg = Inst.getOperand(0).getReg();
7526 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7527
7528 unsigned Opc = Inst.getOpcode();
7529 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7530
7531 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7532 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7533
7534 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7535 Inst.removeOperand(3);
7536
7537 Inst.setDesc(get(NewOpc));
7538 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
7539 Inst.addImplicitDefUseOperands(*MBB.getParent());
7540 MRI.replaceRegWith(OldDstReg, ResultReg);
7541 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7542
7543 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7544 return std::pair(true, NewBB);
7545 }
7546
7547 return std::pair(false, nullptr);
7548}
7549
7550void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7551 MachineDominatorTree *MDT) const {
7552
7553 MachineBasicBlock &MBB = *Inst.getParent();
7554 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7555 MachineBasicBlock::iterator MII = Inst;
7556 DebugLoc DL = Inst.getDebugLoc();
7557
7558 MachineOperand &Dest = Inst.getOperand(0);
7559 MachineOperand &Src0 = Inst.getOperand(1);
7560 MachineOperand &Src1 = Inst.getOperand(2);
7561 MachineOperand &Cond = Inst.getOperand(3);
7562
7563 Register CondReg = Cond.getReg();
7564 bool IsSCC = (CondReg == AMDGPU::SCC);
7565
7566 // If this is a trivial select where the condition is effectively not SCC
7567 // (CondReg is a source of a copy to SCC), then the select is semantically
7568 // equivalent to copying CondReg. Hence, there is no need to create
7569 // V_CNDMASK; we can just use CondReg and bail out.
7570 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7571 (Src1.getImm() == 0)) {
7572 MRI.replaceRegWith(Dest.getReg(), CondReg);
7573 return;
7574 }
7575
7576 Register NewCondReg = CondReg;
7577 if (IsSCC) {
7578 const TargetRegisterClass *TC =
7579 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7580 NewCondReg = MRI.createVirtualRegister(TC);
7581
7582 // Now look for the closest SCC def; if it is a copy,
7583 // replace CondReg with the COPY's source register.
7584 bool CopyFound = false;
7585 for (MachineInstr &CandI :
7586 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7587 Inst.getParent()->rend())) {
7588 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7589 -1) {
7590 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7591 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7592 .addReg(CandI.getOperand(1).getReg());
7593 CopyFound = true;
7594 }
7595 break;
7596 }
7597 }
7598 if (!CopyFound) {
7599 // SCC def is not a copy
7600 // Insert a trivial select instead of creating a copy, because a copy from
7601 // SCC would semantically mean just copying a single bit, but we may need
7602 // the result to be a vector condition mask that needs preserving.
7603 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7604 : AMDGPU::S_CSELECT_B32;
7605 auto NewSelect =
7606 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7607 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7608 }
7609 }
7610
7611 Register NewDestReg = MRI.createVirtualRegister(
7612 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7613 MachineInstr *NewInst;
7614 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7615 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7616 .addImm(0)
7617 .add(Src1) // False
7618 .addImm(0)
7619 .add(Src0) // True
7620 .addReg(NewCondReg);
7621 } else {
7622 NewInst =
7623 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7624 .add(Src1) // False
7625 .add(Src0) // True
7626 .addReg(NewCondReg);
7627 }
7628 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7629 legalizeOperands(*NewInst, MDT);
7630 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7631}
7632
7633void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7634 MachineInstr &Inst) const {
7635 MachineBasicBlock &MBB = *Inst.getParent();
7636 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7637 MachineBasicBlock::iterator MII = Inst;
7638 DebugLoc DL = Inst.getDebugLoc();
7639
7640 MachineOperand &Dest = Inst.getOperand(0);
7641 MachineOperand &Src = Inst.getOperand(1);
7642 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7643 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7644
7645 unsigned SubOp = ST.hasAddNoCarry() ?
7646 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7647
7648 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7649 .addImm(0)
7650 .addReg(Src.getReg());
7651
7652 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7653 .addReg(Src.getReg())
7654 .addReg(TmpReg);
7655
7656 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7657 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7658}
7659
7660void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7661 MachineInstr &Inst) const {
7662 MachineBasicBlock &MBB = *Inst.getParent();
7663 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7664 MachineBasicBlock::iterator MII = Inst;
7665 const DebugLoc &DL = Inst.getDebugLoc();
7666
7667 MachineOperand &Dest = Inst.getOperand(0);
7668 MachineOperand &Src0 = Inst.getOperand(1);
7669 MachineOperand &Src1 = Inst.getOperand(2);
7670
7671 if (ST.hasDLInsts()) {
7672 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7673 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7674 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7675
7676 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7677 .add(Src0)
7678 .add(Src1);
7679
7680 MRI.replaceRegWith(Dest.getReg(), NewDest);
7681 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7682 } else {
7683 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7684 // invert either source and then perform the XOR. If either source is a
7685 // scalar register, then we can leave the inversion on the scalar unit to
7686 // achieve a better distribution of scalar and vector instructions.
7687 bool Src0IsSGPR = Src0.isReg() &&
7688 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7689 bool Src1IsSGPR = Src1.isReg() &&
7690 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
7691 MachineInstr *Xor;
7692 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7693 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7694
7695 // Build a pair of scalar instructions and add them to the work list.
7696 // The next iteration over the work list will lower these to the vector
7697 // unit as necessary.
7698 if (Src0IsSGPR) {
7699 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7700 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7701 .addReg(Temp)
7702 .add(Src1);
7703 } else if (Src1IsSGPR) {
7704 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7705 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7706 .add(Src0)
7707 .addReg(Temp);
7708 } else {
7709 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7710 .add(Src0)
7711 .add(Src1);
7712 MachineInstr *Not =
7713 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7714 Worklist.insert(Not);
7715 }
7716
7717 MRI.replaceRegWith(Dest.getReg(), NewDest);
7718
7719 Worklist.insert(Xor);
7720
7721 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7722 }
7723}
7724
7725void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7726 MachineInstr &Inst,
7727 unsigned Opcode) const {
7728 MachineBasicBlock &MBB = *Inst.getParent();
7729 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7730 MachineBasicBlock::iterator MII = Inst;
7731 const DebugLoc &DL = Inst.getDebugLoc();
7732
7733 MachineOperand &Dest = Inst.getOperand(0);
7734 MachineOperand &Src0 = Inst.getOperand(1);
7735 MachineOperand &Src1 = Inst.getOperand(2);
7736
7737 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7738 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7739
7740 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7741 .add(Src0)
7742 .add(Src1);
7743
7744 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7745 .addReg(Interm);
7746
7747 Worklist.insert(&Op);
7748 Worklist.insert(&Not);
7749
7750 MRI.replaceRegWith(Dest.getReg(), NewDest);
7751 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7752}
7753
7754void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7755 MachineInstr &Inst,
7756 unsigned Opcode) const {
7757 MachineBasicBlock &MBB = *Inst.getParent();
7758 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7759 MachineBasicBlock::iterator MII = Inst;
7760 const DebugLoc &DL = Inst.getDebugLoc();
7761
7762 MachineOperand &Dest = Inst.getOperand(0);
7763 MachineOperand &Src0 = Inst.getOperand(1);
7764 MachineOperand &Src1 = Inst.getOperand(2);
7765
7766 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7767 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7768
7769 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7770 .add(Src1);
7771
7772 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7773 .add(Src0)
7774 .addReg(Interm);
7775
7776 Worklist.insert(&Not);
7777 Worklist.insert(&Op);
7778
7779 MRI.replaceRegWith(Dest.getReg(), NewDest);
7780 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7781}
7782
7783void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7784 MachineInstr &Inst, unsigned Opcode,
7785 bool Swap) const {
7786 MachineBasicBlock &MBB = *Inst.getParent();
7787 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7788
7789 MachineOperand &Dest = Inst.getOperand(0);
7790 MachineOperand &Src0 = Inst.getOperand(1);
7791 DebugLoc DL = Inst.getDebugLoc();
7792
7793 MachineBasicBlock::iterator MII = Inst;
7794
7795 const MCInstrDesc &InstDesc = get(Opcode);
7796 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7797 MRI.getRegClass(Src0.getReg()) :
7798 &AMDGPU::SGPR_32RegClass;
7799
7800 const TargetRegisterClass *Src0SubRC =
7801 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7802
7803 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7804 AMDGPU::sub0, Src0SubRC);
7805
7806 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7807 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7808 const TargetRegisterClass *NewDestSubRC =
7809 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7810
7811 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7812 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7813
7814 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7815 AMDGPU::sub1, Src0SubRC);
7816
7817 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7818 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7819
7820 if (Swap)
7821 std::swap(DestSub0, DestSub1);
7822
7823 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7824 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7825 .addReg(DestSub0)
7826 .addImm(AMDGPU::sub0)
7827 .addReg(DestSub1)
7828 .addImm(AMDGPU::sub1);
7829
7830 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7831
7832 Worklist.insert(&LoHalf);
7833 Worklist.insert(&HiHalf);
7834
7835 // We don't need to call legalizeOperands here because for a single operand,
7836 // src0 will support any kind of input.
7837
7838 // Move all users of this moved value.
7839 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7840}
7841
7842 // There is no vector equivalent of s_mul_u64. For this reason, we need to
7843 // split s_mul_u64 into 32-bit vector multiplications.
7844void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7845 MachineInstr &Inst,
7846 MachineDominatorTree *MDT) const {
7847 MachineBasicBlock &MBB = *Inst.getParent();
7848 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7849
7850 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7851 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7852 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7853
7854 MachineOperand &Dest = Inst.getOperand(0);
7855 MachineOperand &Src0 = Inst.getOperand(1);
7856 MachineOperand &Src1 = Inst.getOperand(2);
7857 const DebugLoc &DL = Inst.getDebugLoc();
7858 MachineBasicBlock::iterator MII = Inst;
7859
7860 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7861 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7862 const TargetRegisterClass *Src0SubRC =
7863 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7864 if (RI.isSGPRClass(Src0SubRC))
7865 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7866 const TargetRegisterClass *Src1SubRC =
7867 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7868 if (RI.isSGPRClass(Src1SubRC))
7869 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7870
7871 // First, we extract the low 32-bit and high 32-bit values from each of the
7872 // operands.
7873 MachineOperand Op0L =
7874 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7875 MachineOperand Op1L =
7876 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7877 MachineOperand Op0H =
7878 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7879 MachineOperand Op1H =
7880 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7881
7882 // The multiplication is done as follows:
7883 //
7884 // Op1H Op1L
7885 // * Op0H Op0L
7886 // --------------------
7887 // Op1H*Op0L Op1L*Op0L
7888 // + Op1H*Op0H Op1L*Op0H
7889 // -----------------------------------------
7890 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7891 //
7892 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7893 // value and that would overflow.
7894 // The low 32-bit value is Op1L*Op0L.
7895 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
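// Below, Op1L_Op0H_Reg and Op1H_Op0L_Reg hold the two cross products,
// CarryReg holds mul_hi(Op1L, Op0L), and DestSub0/DestSub1 receive the low
// and high halves of the result.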
7896
7897 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7898 MachineInstr *Op1L_Op0H =
7899 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7900 .add(Op1L)
7901 .add(Op0H);
7902
7903 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7904 MachineInstr *Op1H_Op0L =
7905 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7906 .add(Op1H)
7907 .add(Op0L);
7908
7909 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7910 MachineInstr *Carry =
7911 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7912 .add(Op1L)
7913 .add(Op0L);
7914
7915 MachineInstr *LoHalf =
7916 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7917 .add(Op1L)
7918 .add(Op0L);
7919
7920 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7921 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7922 .addReg(Op1L_Op0H_Reg)
7923 .addReg(Op1H_Op0L_Reg);
7924
7925 MachineInstr *HiHalf =
7926 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7927 .addReg(AddReg)
7928 .addReg(CarryReg);
7929
7930 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7931 .addReg(DestSub0)
7932 .addImm(AMDGPU::sub0)
7933 .addReg(DestSub1)
7934 .addImm(AMDGPU::sub1);
7935
7936 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7937
7938 // Try to legalize the operands in case we need to swap the order to keep it
7939 // valid.
7940 legalizeOperands(*Op1L_Op0H, MDT);
7941 legalizeOperands(*Op1H_Op0L, MDT);
7942 legalizeOperands(*Carry, MDT);
7943 legalizeOperands(*LoHalf, MDT);
7944 legalizeOperands(*Add, MDT);
7945 legalizeOperands(*HiHalf, MDT);
7946
7947 // Move all users of this moved value.
7948 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7949}
7950
7951 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
7952 // multiplications.
7953void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7954 MachineInstr &Inst,
7955 MachineDominatorTree *MDT) const {
7956 MachineBasicBlock &MBB = *Inst.getParent();
7957 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7958
7959 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7960 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7961 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7962
7963 MachineOperand &Dest = Inst.getOperand(0);
7964 MachineOperand &Src0 = Inst.getOperand(1);
7965 MachineOperand &Src1 = Inst.getOperand(2);
7966 const DebugLoc &DL = Inst.getDebugLoc();
7967 MachineBasicBlock::iterator MII = Inst;
7968
7969 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7970 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7971 const TargetRegisterClass *Src0SubRC =
7972 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7973 if (RI.isSGPRClass(Src0SubRC))
7974 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7975 const TargetRegisterClass *Src1SubRC =
7976 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7977 if (RI.isSGPRClass(Src1SubRC))
7978 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7979
7980 // First, we extract the low 32-bit and high 32-bit values from each of the
7981 // operands.
7982 MachineOperand Op0L =
7983 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7984 MachineOperand Op1L =
7985 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7986
7987 unsigned Opc = Inst.getOpcode();
7988 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7989 ? AMDGPU::V_MUL_HI_U32_e64
7990 : AMDGPU::V_MUL_HI_I32_e64;
7991 MachineInstr *HiHalf =
7992 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7993
7994 MachineInstr *LoHalf =
7995 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7996 .add(Op1L)
7997 .add(Op0L);
7998
7999 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8000 .addReg(DestSub0)
8001 .addImm(AMDGPU::sub0)
8002 .addReg(DestSub1)
8003 .addImm(AMDGPU::sub1);
8004
8005 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8006
8007 // Try to legalize the operands in case we need to swap the order to keep it
8008 // valid.
8009 legalizeOperands(*HiHalf, MDT);
8010 legalizeOperands(*LoHalf, MDT);
8011
8012 // Move all users of this moved value.
8013 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8014}
8015
8016void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8017 MachineInstr &Inst, unsigned Opcode,
8018 MachineDominatorTree *MDT) const {
8019 MachineBasicBlock &MBB = *Inst.getParent();
8020 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8021
8022 MachineOperand &Dest = Inst.getOperand(0);
8023 MachineOperand &Src0 = Inst.getOperand(1);
8024 MachineOperand &Src1 = Inst.getOperand(2);
8025 DebugLoc DL = Inst.getDebugLoc();
8026
8027 MachineBasicBlock::iterator MII = Inst;
8028
8029 const MCInstrDesc &InstDesc = get(Opcode);
8030 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8031 MRI.getRegClass(Src0.getReg()) :
8032 &AMDGPU::SGPR_32RegClass;
8033
8034 const TargetRegisterClass *Src0SubRC =
8035 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8036 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8037 MRI.getRegClass(Src1.getReg()) :
8038 &AMDGPU::SGPR_32RegClass;
8039
8040 const TargetRegisterClass *Src1SubRC =
8041 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8042
8043 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8044 AMDGPU::sub0, Src0SubRC);
8045 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8046 AMDGPU::sub0, Src1SubRC);
8047 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8048 AMDGPU::sub1, Src0SubRC);
8049 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8050 AMDGPU::sub1, Src1SubRC);
8051
8052 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8053 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8054 const TargetRegisterClass *NewDestSubRC =
8055 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8056
8057 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8058 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8059 .add(SrcReg0Sub0)
8060 .add(SrcReg1Sub0);
8061
8062 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8063 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8064 .add(SrcReg0Sub1)
8065 .add(SrcReg1Sub1);
8066
8067 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8068 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8069 .addReg(DestSub0)
8070 .addImm(AMDGPU::sub0)
8071 .addReg(DestSub1)
8072 .addImm(AMDGPU::sub1);
8073
8074 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8075
8076 Worklist.insert(&LoHalf);
8077 Worklist.insert(&HiHalf);
8078
8079 // Move all users of this moved value.
8080 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8081}
8082
8083void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8084 MachineInstr &Inst,
8085 MachineDominatorTree *MDT) const {
8086 MachineBasicBlock &MBB = *Inst.getParent();
8087 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8088
8089 MachineOperand &Dest = Inst.getOperand(0);
8090 MachineOperand &Src0 = Inst.getOperand(1);
8091 MachineOperand &Src1 = Inst.getOperand(2);
8092 const DebugLoc &DL = Inst.getDebugLoc();
8093
8094 MachineBasicBlock::iterator MII = Inst;
8095
8096 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8097
8098 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8099
8100 MachineOperand* Op0;
8101 MachineOperand* Op1;
8102
8103 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8104 Op0 = &Src0;
8105 Op1 = &Src1;
8106 } else {
8107 Op0 = &Src1;
8108 Op1 = &Src0;
8109 }
8110
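// Prefer to invert the SGPR operand so the S_NOT can stay on the scalar
// unit; only the XOR below is queued to be moved to the VALU.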
8111 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8112 .add(*Op0);
8113
8114 Register NewDest = MRI.createVirtualRegister(DestRC);
8115
8116 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8117 .addReg(Interm)
8118 .add(*Op1);
8119
8120 MRI.replaceRegWith(Dest.getReg(), NewDest);
8121
8122 Worklist.insert(&Xor);
8123}
8124
8125void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8126 MachineInstr &Inst) const {
8127 MachineBasicBlock &MBB = *Inst.getParent();
8128 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8129
8130 MachineBasicBlock::iterator MII = Inst;
8131 const DebugLoc &DL = Inst.getDebugLoc();
8132
8133 MachineOperand &Dest = Inst.getOperand(0);
8134 MachineOperand &Src = Inst.getOperand(1);
8135
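// A 64-bit popcount is the sum of the 32-bit popcounts of the two halves.
// V_BCNT_U32_B32 computes bcnt(src0) + src1, so the second bcnt below folds
// in the first half's count held in MidReg.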
8136 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8137 const TargetRegisterClass *SrcRC = Src.isReg() ?
8138 MRI.getRegClass(Src.getReg()) :
8139 &AMDGPU::SGPR_32RegClass;
8140
8141 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8142 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8143
8144 const TargetRegisterClass *SrcSubRC =
8145 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8146
8147 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8148 AMDGPU::sub0, SrcSubRC);
8149 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8150 AMDGPU::sub1, SrcSubRC);
8151
8152 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8153
8154 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8155
8156 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8157
8158 // We don't need to legalize operands here. src0 for either instruction can be
8159 // an SGPR, and the second input is unused or determined here.
8160 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8161}
8162
8163void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8164 MachineInstr &Inst) const {
8165 MachineBasicBlock &MBB = *Inst.getParent();
8166 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8167 MachineBasicBlock::iterator MII = Inst;
8168 const DebugLoc &DL = Inst.getDebugLoc();
8169
8170 MachineOperand &Dest = Inst.getOperand(0);
8171 uint32_t Imm = Inst.getOperand(2).getImm();
8172 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8173 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8174
8175 (void) Offset;
8176
8177 // Only sext_inreg cases handled.
8178 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8179 Offset == 0 && "Not implemented");
8180
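// For BitWidth < 32, sign-extend within the low half with V_BFE_I32, then
// replicate its sign bit into the high half with an arithmetic shift by 31.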
8181 if (BitWidth < 32) {
8182 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8183 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8184 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8185
8186 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8187 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8188 .addImm(0)
8189 .addImm(BitWidth);
8190
8191 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8192 .addImm(31)
8193 .addReg(MidRegLo);
8194
8195 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8196 .addReg(MidRegLo)
8197 .addImm(AMDGPU::sub0)
8198 .addReg(MidRegHi)
8199 .addImm(AMDGPU::sub1);
8200
8201 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8202 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8203 return;
8204 }
8205
8206 MachineOperand &Src = Inst.getOperand(1);
8207 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8208 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8209
8210 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8211 .addImm(31)
8212 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8213
8214 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8215 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8216 .addImm(AMDGPU::sub0)
8217 .addReg(TmpReg)
8218 .addImm(AMDGPU::sub1);
8219
8220 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8221 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8222}
8223
8224void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8225 MachineInstr &Inst, unsigned Opcode,
8226 MachineDominatorTree *MDT) const {
8227 // (S_FLBIT_I32_B64 hi:lo) ->
8228 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8229 // (S_FF1_I32_B64 hi:lo) ->
8230 // -> (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
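//
// If a 32-bit half is all zeros, V_FFBH/V_FFBL return 0xffffffff; the
// saturating (clamped) add keeps that at 0xffffffff, so the V_MIN below
// picks the count from the other half.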
8231
8232 MachineBasicBlock &MBB = *Inst.getParent();
8233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8234 MachineBasicBlock::iterator MII = Inst;
8235 const DebugLoc &DL = Inst.getDebugLoc();
8236
8237 MachineOperand &Dest = Inst.getOperand(0);
8238 MachineOperand &Src = Inst.getOperand(1);
8239
8240 const MCInstrDesc &InstDesc = get(Opcode);
8241
8242 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8243 unsigned OpcodeAdd =
8244 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8245
8246 const TargetRegisterClass *SrcRC =
8247 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8248 const TargetRegisterClass *SrcSubRC =
8249 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8250
8251 MachineOperand SrcRegSub0 =
8252 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8253 MachineOperand SrcRegSub1 =
8254 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8255
8256 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8257 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8258 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8259 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8260
8261 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8262
8263 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8264
8265 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8266 .addReg(IsCtlz ? MidReg1 : MidReg2)
8267 .addImm(32)
8268 .addImm(1); // enable clamp
8269
8270 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8271 .addReg(MidReg3)
8272 .addReg(IsCtlz ? MidReg2 : MidReg1);
8273
8274 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8275
8276 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8277}
8278
8279void SIInstrInfo::addUsersToMoveToVALUWorklist(
8280 Register DstReg, MachineRegisterInfo &MRI,
8281 SIInstrWorklist &Worklist) const {
8282 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8283 E = MRI.use_end(); I != E;) {
8284 MachineInstr &UseMI = *I->getParent();
8285
8286 unsigned OpNo = 0;
8287
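// For copy-like and PHI-like users the relevant constraint is the register
// class of the result (operand 0); for other users, check the class
// required at the operand that actually reads DstReg.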
8288 switch (UseMI.getOpcode()) {
8289 case AMDGPU::COPY:
8290 case AMDGPU::WQM:
8291 case AMDGPU::SOFT_WQM:
8292 case AMDGPU::STRICT_WWM:
8293 case AMDGPU::STRICT_WQM:
8294 case AMDGPU::REG_SEQUENCE:
8295 case AMDGPU::PHI:
8296 case AMDGPU::INSERT_SUBREG:
8297 break;
8298 default:
8299 OpNo = I.getOperandNo();
8300 break;
8301 }
8302
8303 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8304 Worklist.insert(&UseMI);
8305
8306 do {
8307 ++I;
8308 } while (I != E && I->getParent() == &UseMI);
8309 } else {
8310 ++I;
8311 }
8312 }
8313}
8314
8315void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
8316 MachineRegisterInfo &MRI,
8317 MachineInstr &Inst) const {
8318 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8319 MachineBasicBlock *MBB = Inst.getParent();
8320 MachineOperand &Src0 = Inst.getOperand(1);
8321 MachineOperand &Src1 = Inst.getOperand(2);
8322 const DebugLoc &DL = Inst.getDebugLoc();
8323
8324 switch (Inst.getOpcode()) {
8325 case AMDGPU::S_PACK_LL_B32_B16: {
8326 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8327 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8328
8329 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8330 // 0.
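// ResultReg = (Src1 << 16) | (Src0 & 0xffff)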
8331 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8332 .addImm(0xffff);
8333
8334 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8335 .addReg(ImmReg, RegState::Kill)
8336 .add(Src0);
8337
8338 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8339 .add(Src1)
8340 .addImm(16)
8341 .addReg(TmpReg, RegState::Kill);
8342 break;
8343 }
8344 case AMDGPU::S_PACK_LH_B32_B16: {
8345 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8346 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8347 .addImm(0xffff);
8348 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8349 .addReg(ImmReg, RegState::Kill)
8350 .add(Src0)
8351 .add(Src1);
8352 break;
8353 }
8354 case AMDGPU::S_PACK_HL_B32_B16: {
8355 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8356 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8357 .addImm(16)
8358 .add(Src0);
8359 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8360 .add(Src1)
8361 .addImm(16)
8362 .addReg(TmpReg, RegState::Kill);
8363 break;
8364 }
8365 case AMDGPU::S_PACK_HH_B32_B16: {
8366 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8367 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
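// ResultReg = (Src1 & 0xffff0000) | (Src0 >> 16)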
8368 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8369 .addImm(16)
8370 .add(Src0);
8371 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8372 .addImm(0xffff0000);
8373 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8374 .add(Src1)
8375 .addReg(ImmReg, RegState::Kill)
8376 .addReg(TmpReg, RegState::Kill);
8377 break;
8378 }
8379 default:
8380 llvm_unreachable("unhandled s_pack_* instruction");
8381 }
8382
8383 MachineOperand &Dest = Inst.getOperand(0);
8384 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8385 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8386}
8387
8388void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8389 MachineInstr &SCCDefInst,
8390 SIInstrWorklist &Worklist,
8391 Register NewCond) const {
8392
8393 // Ensure that def inst defines SCC, which is still live.
8394 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8395 !Op.isDead() && Op.getParent() == &SCCDefInst);
8396 SmallVector<MachineInstr *, 4> CopyToDelete;
8397 // This assumes that all the users of SCC are in the same block
8398 // as the SCC def.
8399 for (MachineInstr &MI : // Skip the def inst itself.
8400 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8401 SCCDefInst.getParent()->end())) {
8402 // Check if SCC is used first.
8403 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8404 if (SCCIdx != -1) {
8405 if (MI.isCopy()) {
8406 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8407 Register DestReg = MI.getOperand(0).getReg();
8408
8409 MRI.replaceRegWith(DestReg, NewCond);
8410 CopyToDelete.push_back(&MI);
8411 } else {
8412
8413 if (NewCond.isValid())
8414 MI.getOperand(SCCIdx).setReg(NewCond);
8415
8416 Worklist.insert(&MI);
8417 }
8418 }
8419 // Exit if we find another SCC def.
8420 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8421 break;
8422 }
8423 for (auto &Copy : CopyToDelete)
8424 Copy->eraseFromParent();
8425}
8426
8427// Instructions that use SCC may be converted to VALU instructions. When that
8428// happens, the SCC register is changed to VCC_LO. The instruction that defines
8429// SCC must be changed to an instruction that defines VCC. This function makes
8430// sure that the instruction that defines SCC is added to the moveToVALU
8431// worklist.
8432void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8433 SIInstrWorklist &Worklist) const {
8434 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8435 // then there is nothing to do because the defining instruction has been
8436 // converted to a VALU already. If SCC then that instruction needs to be
8437 // converted to a VALU.
8438 for (MachineInstr &MI :
8439 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8440 SCCUseInst->getParent()->rend())) {
8441 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8442 break;
8443 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8444 Worklist.insert(&MI);
8445 break;
8446 }
8447 }
8448}
8449
8450const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8451 const MachineInstr &Inst) const {
8452 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8453
8454 switch (Inst.getOpcode()) {
8455 // For target instructions, getOpRegClass just returns the virtual register
8456 // class associated with the operand, so we need to find an equivalent VGPR
8457 // register class in order to move the instruction to the VALU.
8458 case AMDGPU::COPY:
8459 case AMDGPU::PHI:
8460 case AMDGPU::REG_SEQUENCE:
8461 case AMDGPU::INSERT_SUBREG:
8462 case AMDGPU::WQM:
8463 case AMDGPU::SOFT_WQM:
8464 case AMDGPU::STRICT_WWM:
8465 case AMDGPU::STRICT_WQM: {
8466 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8467 if (RI.isAGPRClass(SrcRC)) {
8468 if (RI.isAGPRClass(NewDstRC))
8469 return nullptr;
8470
8471 switch (Inst.getOpcode()) {
8472 case AMDGPU::PHI:
8473 case AMDGPU::REG_SEQUENCE:
8474 case AMDGPU::INSERT_SUBREG:
8475 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8476 break;
8477 default:
8478 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8479 }
8480
8481 if (!NewDstRC)
8482 return nullptr;
8483 } else {
8484 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8485 return nullptr;
8486
8487 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8488 if (!NewDstRC)
8489 return nullptr;
8490 }
8491
8492 return NewDstRC;
8493 }
8494 default:
8495 return NewDstRC;
8496 }
8497}
8498
8499// Find the one SGPR operand we are allowed to use.
8500Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8501 int OpIndices[3]) const {
8502 const MCInstrDesc &Desc = MI.getDesc();
8503
8504 // Find the one SGPR operand we are allowed to use.
8505 //
8506 // First we need to consider the instruction's operand requirements before
8507 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8508 // of VCC, but we are still bound by the constant bus requirement to only use
8509 // one.
8510 //
8511 // If the operand's class is an SGPR, we can never move it.
8512
8513 Register SGPRReg = findImplicitSGPRRead(MI);
8514 if (SGPRReg)
8515 return SGPRReg;
8516
8517 Register UsedSGPRs[3] = {Register()};
8518 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8519
8520 for (unsigned i = 0; i < 3; ++i) {
8521 int Idx = OpIndices[i];
8522 if (Idx == -1)
8523 break;
8524
8525 const MachineOperand &MO = MI.getOperand(Idx);
8526 if (!MO.isReg())
8527 continue;
8528
8529 // Is this operand statically required to be an SGPR based on the operand
8530 // constraints?
8531 const TargetRegisterClass *OpRC =
8532 RI.getRegClass(Desc.operands()[Idx].RegClass);
8533 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8534 if (IsRequiredSGPR)
8535 return MO.getReg();
8536
8537 // If this could be a VGPR or an SGPR, check the dynamic register class.
8538 Register Reg = MO.getReg();
8539 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8540 if (RI.isSGPRClass(RegRC))
8541 UsedSGPRs[i] = Reg;
8542 }
8543
8544 // We don't have a required SGPR operand, so we have a bit more freedom in
8545 // selecting operands to move.
8546
8547 // Try to select the most used SGPR. If an SGPR is equal to one of the
8548 // others, we choose that.
8549 //
8550 // e.g.
8551 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8552 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8553
8554 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8555 // prefer those.
8556
8557 if (UsedSGPRs[0]) {
8558 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8559 SGPRReg = UsedSGPRs[0];
8560 }
8561
8562 if (!SGPRReg && UsedSGPRs[1]) {
8563 if (UsedSGPRs[1] == UsedSGPRs[2])
8564 SGPRReg = UsedSGPRs[1];
8565 }
8566
8567 return SGPRReg;
8568}
8569
8571 unsigned OperandName) const {
8572 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8573 if (Idx == -1)
8574 return nullptr;
8575
8576 return &MI.getOperand(Idx);
8577}
8578
8584 return (Format << 44) |
8585 (1ULL << 56) | // RESOURCE_LEVEL = 1
8586 (3ULL << 60); // OOB_SELECT = 3
8587 }
8588
8589 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8590 if (ST.isAmdHsaOS()) {
8591 // Set ATC = 1. GFX9 doesn't have this bit.
8593 RsrcDataFormat |= (1ULL << 56);
8594
8595 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8596 // Note that this also disables TC (L2) caching and therefore decreases performance.
8598 RsrcDataFormat |= (2ULL << 59);
8599 }
8600
8601 return RsrcDataFormat;
8602}
8603
8607 0xffffffff; // Size;
8608
8609 // GFX9 doesn't have ELEMENT_SIZE.
8611 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8612 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8613 }
8614
8615 // IndexStride = 64 (wave64) or 32 (wave32).
8616 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8617 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8618
8619 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8620 // Clear them unless we want a huge stride.
8623 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8624
8625 return Rsrc23;
8626}
8627
8629 unsigned Opc = MI.getOpcode();
8630
8631 return isSMRD(Opc);
8632}
8633
8635 return get(Opc).mayLoad() &&
8636 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8637}
8638
8640 int &FrameIndex) const {
8641 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8642 if (!Addr || !Addr->isFI())
8643 return Register();
8644
8645 assert(!MI.memoperands_empty() &&
8646 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8647
8648 FrameIndex = Addr->getIndex();
8649 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8650}
8651
8653 int &FrameIndex) const {
8654 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8655 assert(Addr && Addr->isFI());
8656 FrameIndex = Addr->getIndex();
8657 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8658}
8659
8661 int &FrameIndex) const {
8662 if (!MI.mayLoad())
8663 return Register();
8664
8665 if (isMUBUF(MI) || isVGPRSpill(MI))
8666 return isStackAccess(MI, FrameIndex);
8667
8668 if (isSGPRSpill(MI))
8669 return isSGPRStackAccess(MI, FrameIndex);
8670
8671 return Register();
8672}
8673
8675 int &FrameIndex) const {
8676 if (!MI.mayStore())
8677 return Register();
8678
8679 if (isMUBUF(MI) || isVGPRSpill(MI))
8680 return isStackAccess(MI, FrameIndex);
8681
8682 if (isSGPRSpill(MI))
8683 return isSGPRStackAccess(MI, FrameIndex);
8684
8685 return Register();
8686}
8687
8689 unsigned Size = 0;
8691 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8692 while (++I != E && I->isInsideBundle()) {
8693 assert(!I->isBundle() && "No nested bundle!");
8695 }
8696
8697 return Size;
8698}
8699
8701 unsigned Opc = MI.getOpcode();
8703 unsigned DescSize = Desc.getSize();
8704
8705 // If we have a definitive size, we can use it. Otherwise we need to inspect
8706 // the operands to know the size.
8707 if (isFixedSize(MI)) {
8708 unsigned Size = DescSize;
8709
8710 // If we hit the buggy offset, an extra nop will be inserted in MC so
8711 // estimate the worst case.
8712 if (MI.isBranch() && ST.hasOffset3fBug())
8713 Size += 4;
8714
8715 return Size;
8716 }
8717
8718 // Instructions may have a 32-bit literal encoded after them. Check
8719 // operands that could ever be literals.
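 // For illustration (assuming a typical 4-byte VOP2 base encoding such as
 // V_ADD_F32_e32): an operand that is a literal rather than an inline
 // constant makes the instruction 4 + 4 = 8 bytes here.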
8720 if (isVALU(MI) || isSALU(MI)) {
8721 if (isDPP(MI))
8722 return DescSize;
8723 bool HasLiteral = false;
8724 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8725 const MachineOperand &Op = MI.getOperand(I);
8726 const MCOperandInfo &OpInfo = Desc.operands()[I];
8727 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8728 HasLiteral = true;
8729 break;
8730 }
8731 }
8732 return HasLiteral ? DescSize + 4 : DescSize;
8733 }
8734
8735 // Check whether we have extra NSA words.
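 // For illustration (assuming srsrc immediately follows the last vaddr
 // operand, so RSrcIdx - VAddr0Idx is the vaddr count): one address gives
 // 8 + 4 * ((1 + 2) / 4) = 8 bytes, while five addresses give
 // 8 + 4 * ((5 + 2) / 4) = 12 bytes for the extra NSA dword.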
8736 if (isMIMG(MI)) {
8737 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8738 if (VAddr0Idx < 0)
8739 return 8;
8740
8741 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8742 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8743 }
8744
8745 switch (Opc) {
8746 case TargetOpcode::BUNDLE:
8747 return getInstBundleSize(MI);
8748 case TargetOpcode::INLINEASM:
8749 case TargetOpcode::INLINEASM_BR: {
8750 const MachineFunction *MF = MI.getParent()->getParent();
8751 const char *AsmStr = MI.getOperand(0).getSymbolName();
8752 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8753 }
8754 default:
8755 if (MI.isMetaInstruction())
8756 return 0;
8757 return DescSize;
8758 }
8759}
8760
8762 if (!isFLAT(MI))
8763 return false;
8764
8765 if (MI.memoperands_empty())
8766 return true;
8767
8768 for (const MachineMemOperand *MMO : MI.memoperands()) {
8769 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8770 return true;
8771 }
8772 return false;
8773}
8774
8776 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8777}
8778
8780 MachineBasicBlock *IfEnd) const {
8782 assert(TI != IfEntry->end());
8783
8784 MachineInstr *Branch = &(*TI);
8785 MachineFunction *MF = IfEntry->getParent();
8787
8788 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8789 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8790 MachineInstr *SIIF =
8791 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8792 .add(Branch->getOperand(0))
8793 .add(Branch->getOperand(1));
8794 MachineInstr *SIEND =
8795 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8796 .addReg(DstReg);
8797
8798 IfEntry->erase(TI);
8799 IfEntry->insert(IfEntry->end(), SIIF);
8800 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8801 }
8802}
8803
8805 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8807 // We expect 2 terminators, one conditional and one unconditional.
8808 assert(TI != LoopEnd->end());
8809
8810 MachineInstr *Branch = &(*TI);
8811 MachineFunction *MF = LoopEnd->getParent();
8813
8814 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8815
8816 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8817 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8818 MachineInstrBuilder HeaderPHIBuilder =
8819 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8820 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8821 if (PMBB == LoopEnd) {
8822 HeaderPHIBuilder.addReg(BackEdgeReg);
8823 } else {
8824 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8825 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8826 ZeroReg, 0);
8827 HeaderPHIBuilder.addReg(ZeroReg);
8828 }
8829 HeaderPHIBuilder.addMBB(PMBB);
8830 }
8831 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8832 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8833 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8834 .addReg(DstReg)
8835 .add(Branch->getOperand(0));
8836 MachineInstr *SILOOP =
8837 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8838 .addReg(BackEdgeReg)
8839 .addMBB(LoopEntry);
8840
8841 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8842 LoopEnd->erase(TI);
8843 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8844 LoopEnd->insert(LoopEnd->end(), SILOOP);
8845 }
8846}
8847
8850 static const std::pair<int, const char *> TargetIndices[] = {
8851 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8852 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8853 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8854 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8855 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8856 return ArrayRef(TargetIndices);
8857}
8858
8859/// This is used by the post-RA scheduler (PostRASchedulerList.cpp). The
8860/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8863 const ScheduleDAG *DAG) const {
8864 return new GCNHazardRecognizer(DAG->MF);
8865}
8866
8867/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8868/// pass.
8871 return new GCNHazardRecognizer(MF);
8872}
8873
8874// Called during:
8875// - pre-RA scheduling and post-RA scheduling
8878 const ScheduleDAGMI *DAG) const {
8879 // Borrowed from Arm Target
8880 // We would like to restrict this hazard recognizer to only
8881 // post-RA scheduling; we can tell that we're post-RA because we don't
8882 // track VRegLiveness.
8883 if (!DAG->hasVRegLiveness())
8884 return new GCNHazardRecognizer(DAG->MF);
8886}
8887
8888std::pair<unsigned, unsigned>
8890 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8891}
8892
8895 static const std::pair<unsigned, const char *> TargetFlags[] = {
8896 { MO_GOTPCREL, "amdgpu-gotprel" },
8897 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8898 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8899 { MO_REL32_LO, "amdgpu-rel32-lo" },
8900 { MO_REL32_HI, "amdgpu-rel32-hi" },
8901 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8902 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8903 };
8904
8905 return ArrayRef(TargetFlags);
8906}
8907
8910 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8911 {
8912 {MONoClobber, "amdgpu-noclobber"},
8913 {MOLastUse, "amdgpu-last-use"},
8914 };
8915
8916 return ArrayRef(TargetFlags);
8917}
8918
8920 const MachineFunction &MF) const {
8922 assert(SrcReg.isVirtual());
8923 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8924 return AMDGPU::WWM_COPY;
8925
8926 return AMDGPU::COPY;
8927}
8928
8930 Register Reg) const {
8931 // We need to handle instructions which may be inserted during register
8932 // allocation to handle the prolog. The initial prolog instruction may have
8933 // been separated from the start of the block by spills and copies that were
8934 // inserted for the prolog itself. However, the insertions for scalar registers can
8935 // always be placed at the BB top as they are independent of the exec mask
8936 // value.
8937 bool IsNullOrVectorRegister = true;
8938 if (Reg) {
8939 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8940 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8941 }
8942
8943 uint16_t Opcode = MI.getOpcode();
8944 // FIXME: Copies inserted in the block prolog for live-range split should also
8945 // be included.
8946 return IsNullOrVectorRegister &&
8947 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8948 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8949}
8950
8954 const DebugLoc &DL,
8955 Register DestReg) const {
8956 if (ST.hasAddNoCarry())
8957 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8958
8960 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8961 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8962
8963 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8964 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8965}
8966
8969 const DebugLoc &DL,
8970 Register DestReg,
8971 RegScavenger &RS) const {
8972 if (ST.hasAddNoCarry())
8973 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8974
8975 // If available, prefer to use vcc.
8976 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8977 ? Register(RI.getVCC())
8979 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8980 0, /* AllowSpill */ false);
8981
8982 // TODO: Users need to deal with this.
8983 if (!UnusedCarry.isValid())
8984 return MachineInstrBuilder();
8985
8986 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8987 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8988}
8989
8990bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8991 switch (Opcode) {
8992 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8993 case AMDGPU::SI_KILL_I1_TERMINATOR:
8994 return true;
8995 default:
8996 return false;
8997 }
8998}
8999
9001 switch (Opcode) {
9002 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
9003 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
9004 case AMDGPU::SI_KILL_I1_PSEUDO:
9005 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
9006 default:
9007 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
9008 }
9009}
9010
9011bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9012 return Imm <= getMaxMUBUFImmOffset(ST);
9013}
9014
9016 // On GFX12 the field is a 24-bit signed byte offset; only non-negative values are usable here.
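 // For illustration: 23 offset bits give (1 << 23) - 1 = 8388607 (GFX12 and
 // later), while 12 offset bits give (1 << 12) - 1 = 4095 (earlier targets).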
9017 const unsigned OffsetBits =
9018 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9019 return (1 << OffsetBits) - 1;
9020}
9021
9023 if (!ST.isWave32())
9024 return;
9025
9026 if (MI.isInlineAsm())
9027 return;
9028
9029 for (auto &Op : MI.implicit_operands()) {
9030 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9031 Op.setReg(AMDGPU::VCC_LO);
9032 }
9033}
9034
9036 if (!isSMRD(MI))
9037 return false;
9038
9039 // Check that it is using a buffer resource.
9040 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9041 if (Idx == -1) // e.g. s_memtime
9042 return false;
9043
9044 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9045 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9046}
9047
9048// Given Imm, split it into the values to put into the SOffset and ImmOffset
9049// fields in an MUBUF instruction. Return false if it is not possible (due to a
9050// hardware bug needing a workaround).
9051//
9052// The required alignment ensures that individual address components remain
9053// aligned if they are aligned to begin with. It also ensures that additional
9054// offsets within the given alignment can be added to the resulting ImmOffset.
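//
// Worked example (a sketch assuming a pre-GFX12 maximum immediate offset of
// 4095 and Alignment = 4, so MaxImm = 4092): Imm = 4100 exceeds MaxImm but is
// within MaxImm + 64, so the split is ImmOffset = 4092 and SOffset = 8, with
// the SOffset falling into the inline-constant range 4..64.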
9056 uint32_t &ImmOffset, Align Alignment) const {
9057 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9058 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9059 uint32_t Overflow = 0;
9060
9061 if (Imm > MaxImm) {
9062 if (Imm <= MaxImm + 64) {
9063 // Use an SOffset inline constant for 4..64
9064 Overflow = Imm - MaxImm;
9065 Imm = MaxImm;
9066 } else {
9067 // Try to keep the same value in SOffset for adjacent loads, so that
9068 // the corresponding register contents can be re-used.
9069 //
9070 // Load values with all low-bits (except for alignment bits) set into
9071 // SOffset, so that a larger range of values can be covered using
9072 // s_movk_i32.
9073 //
9074 // Atomic operations fail to work correctly when individual address
9075 // components are unaligned, even if their sum is aligned.
9076 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9077 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9078 Imm = Low;
9079 Overflow = High - Alignment.value();
9080 }
9081 }
9082
9083 if (Overflow > 0) {
9084 // There is a hardware bug in SI and CI which prevents address clamping in
9085 // MUBUF instructions from working correctly with SOffsets. The immediate
9086 // offset is unaffected.
9088 return false;
9089
9090 // It is not possible to set an immediate in the SOffset field on some targets.
9091 if (ST.hasRestrictedSOffset())
9092 return false;
9093 }
9094
9095 ImmOffset = Imm;
9096 SOffset = Overflow;
9097 return true;
9098}
9099
9100// Depending on the used address space and instructions, some immediate offsets
9101// are allowed and some are not.
9102// Pre-GFX12, flat instruction offsets can only be non-negative, global and
9103// scratch instruction offsets can also be negative. On GFX12, offsets can be
9104// negative for all variants.
9105//
9106// There are several bugs related to these offsets:
9107// On gfx10.1, flat instructions that go into the global address space cannot
9108// use an offset.
9109//
9110// For scratch instructions, the address can be either an SGPR or a VGPR.
9111// The following offsets can be used, depending on the architecture (x means
9112// cannot be used):
9113// +----------------------------+------+------+
9114// | Address-Mode | SGPR | VGPR |
9115// +----------------------------+------+------+
9116// | gfx9 | | |
9117// | negative, 4-aligned offset | x | ok |
9118// | negative, unaligned offset | x | ok |
9119// +----------------------------+------+------+
9120// | gfx10 | | |
9121// | negative, 4-aligned offset | ok | ok |
9122// | negative, unaligned offset | ok | x |
9123// +----------------------------+------+------+
9124// | gfx10.3 | | |
9125// | negative, 4-aligned offset | ok | ok |
9126// | negative, unaligned offset | ok | ok |
9127// +----------------------------+------+------+
9128//
9129// This function ignores the addressing mode, so if an offset cannot be used in
9130// one addressing mode, it is considered illegal.
9131bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9132 uint64_t FlatVariant) const {
9133 // TODO: Should 0 be special cased?
9134 if (!ST.hasFlatInstOffsets())
9135 return false;
9136
9137 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9138 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9139 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9140 return false;
9141
9143 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9144 (Offset % 4) != 0) {
9145 return false;
9146 }
9147
9148 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9149 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9150 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9151}
9152
9153 // See the comment on SIInstrInfo::isLegalFLATOffset for what is legal and what is not.
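//
// Worked example (a sketch assuming a 13-bit signed offset field, so NumBits
// below is 12 and the chunk size is 4096, with negative offsets allowed):
// COffsetVal = 5000 splits into {ImmField = 904, RemainderOffset = 4096};
// COffsetVal = -5000 splits into {ImmField = -904, RemainderOffset = -4096},
// since the signed division truncates towards zero.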
9154std::pair<int64_t, int64_t>
9155SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9156 uint64_t FlatVariant) const {
9157 int64_t RemainderOffset = COffsetVal;
9158 int64_t ImmField = 0;
9159
9160 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9161 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9162
9163 if (AllowNegative) {
9164 // Use signed division by a power of two to truncate towards 0.
9165 int64_t D = 1LL << NumBits;
9166 RemainderOffset = (COffsetVal / D) * D;
9167 ImmField = COffsetVal - RemainderOffset;
9168
9170 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9171 (ImmField % 4) != 0) {
9172 // Make ImmField a multiple of 4
9173 RemainderOffset += ImmField % 4;
9174 ImmField -= ImmField % 4;
9175 }
9176 } else if (COffsetVal >= 0) {
9177 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9178 RemainderOffset = COffsetVal - ImmField;
9179 }
9180
9181 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9182 assert(RemainderOffset + ImmField == COffsetVal);
9183 return {ImmField, RemainderOffset};
9184}
9185
9187 if (ST.hasNegativeScratchOffsetBug() &&
9188 FlatVariant == SIInstrFlags::FlatScratch)
9189 return false;
9190
9191 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9192}
9193
9194static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9195 switch (ST.getGeneration()) {
9196 default:
9197 break;
9200 return SIEncodingFamily::SI;
9203 return SIEncodingFamily::VI;
9210 }
9211 llvm_unreachable("Unknown subtarget generation!");
9212}
9213
9214bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9215 switch(MCOp) {
9216 // These opcodes use indirect register addressing so
9217 // they need special handling by codegen (currently missing).
9218 // Therefore it is too risky to allow these opcodes
9219 // to be selected by the DPP combiner or the SDWA peephole pass.
9220 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9221 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9222 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9223 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9224 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9225 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9226 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9227 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9228 return true;
9229 default:
9230 return false;
9231 }
9232}
9233
9234int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9235 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9236
9237 unsigned Gen = subtargetEncodingFamily(ST);
9238
9239 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9242
9243 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9244 // subtarget has UnpackedD16VMem feature.
9245 // TODO: remove this when we discard GFX80 encoding.
9246 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9248
9249 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9250 switch (ST.getGeneration()) {
9251 default:
9253 break;
9256 break;
9259 break;
9260 }
9261 }
9262
9263 if (isMAI(Opcode)) {
9264 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9265 if (MFMAOp != -1)
9266 Opcode = MFMAOp;
9267 }
9268
9269 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9270
9271 // -1 means that Opcode is already a native instruction.
9272 if (MCOp == -1)
9273 return Opcode;
9274
9275 if (ST.hasGFX90AInsts()) {
9276 uint16_t NMCOp = (uint16_t)-1;
9277 if (ST.hasGFX940Insts())
9279 if (NMCOp == (uint16_t)-1)
9281 if (NMCOp == (uint16_t)-1)
9283 if (NMCOp != (uint16_t)-1)
9284 MCOp = NMCOp;
9285 }
9286
9287 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9288 // no encoding in the given subtarget generation.
9289 if (MCOp == (uint16_t)-1)
9290 return -1;
9291
9292 if (isAsmOnlyOpcode(MCOp))
9293 return -1;
9294
9295 return MCOp;
9296}
9297
9298static
9300 assert(RegOpnd.isReg());
9301 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9302 getRegSubRegPair(RegOpnd);
9303}
9304
9307 assert(MI.isRegSequence());
9308 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9309 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9310 auto &RegOp = MI.getOperand(1 + 2 * I);
9311 return getRegOrUndef(RegOp);
9312 }
9314}
9315
9316// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9317// Following a subreg of reg:subreg isn't supported
9320 if (!RSR.SubReg)
9321 return false;
9322 switch (MI.getOpcode()) {
9323 default: break;
9324 case AMDGPU::REG_SEQUENCE:
9325 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9326 return true;
9327 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9328 case AMDGPU::INSERT_SUBREG:
9329 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9330 // inserted the subreg we're looking for
9331 RSR = getRegOrUndef(MI.getOperand(2));
9332 else { // the subreg in the rest of the reg
9333 auto R1 = getRegOrUndef(MI.getOperand(1));
9334 if (R1.SubReg) // subreg of subreg isn't supported
9335 return false;
9336 RSR.Reg = R1.Reg;
9337 }
9338 return true;
9339 }
9340 return false;
9341}
9342
9345 assert(MRI.isSSA());
9346 if (!P.Reg.isVirtual())
9347 return nullptr;
9348
9349 auto RSR = P;
9350 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9351 while (auto *MI = DefInst) {
9352 DefInst = nullptr;
9353 switch (MI->getOpcode()) {
9354 case AMDGPU::COPY:
9355 case AMDGPU::V_MOV_B32_e32: {
9356 auto &Op1 = MI->getOperand(1);
9357 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9358 if (Op1.isUndef())
9359 return nullptr;
9360 RSR = getRegSubRegPair(Op1);
9361 DefInst = MRI.getVRegDef(RSR.Reg);
9362 }
9363 break;
9364 }
9365 default:
9366 if (followSubRegDef(*MI, RSR)) {
9367 if (!RSR.Reg)
9368 return nullptr;
9369 DefInst = MRI.getVRegDef(RSR.Reg);
9370 }
9371 }
9372 if (!DefInst)
9373 return MI;
9374 }
9375 return nullptr;
9376}
9377
9379 Register VReg,
9380 const MachineInstr &DefMI,
9381 const MachineInstr &UseMI) {
9382 assert(MRI.isSSA() && "Must be run on SSA");
9383
9384 auto *TRI = MRI.getTargetRegisterInfo();
9385 auto *DefBB = DefMI.getParent();
9386
9387 // Don't bother searching between blocks, although it is possible this block
9388 // doesn't modify exec.
9389 if (UseMI.getParent() != DefBB)
9390 return true;
9391
9392 const int MaxInstScan = 20;
9393 int NumInst = 0;
9394
9395 // Stop scan at the use.
9396 auto E = UseMI.getIterator();
9397 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9398 if (I->isDebugInstr())
9399 continue;
9400
9401 if (++NumInst > MaxInstScan)
9402 return true;
9403
9404 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9405 return true;
9406 }
9407
9408 return false;
9409}
9410
9412 Register VReg,
9413 const MachineInstr &DefMI) {
9414 assert(MRI.isSSA() && "Must be run on SSA");
9415
9416 auto *TRI = MRI.getTargetRegisterInfo();
9417 auto *DefBB = DefMI.getParent();
9418
9419 const int MaxUseScan = 10;
9420 int NumUse = 0;
9421
9422 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9423 auto &UseInst = *Use.getParent();
9424 // Don't bother searching between blocks, although it is possible this block
9425 // doesn't modify exec.
9426 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9427 return true;
9428
9429 if (++NumUse > MaxUseScan)
9430 return true;
9431 }
9432
9433 if (NumUse == 0)
9434 return false;
9435
9436 const int MaxInstScan = 20;
9437 int NumInst = 0;
9438
9439 // Stop scan when we have seen all the uses.
9440 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9441 assert(I != DefBB->end());
9442
9443 if (I->isDebugInstr())
9444 continue;
9445
9446 if (++NumInst > MaxInstScan)
9447 return true;
9448
9449 for (const MachineOperand &Op : I->operands()) {
9450 // We don't check reg masks here as they're used only on calls:
9451 // 1. EXEC is only considered const within one BB
9452 // 2. Call should be a terminator instruction if present in a BB
9453
9454 if (!Op.isReg())
9455 continue;
9456
9457 Register Reg = Op.getReg();
9458 if (Op.isUse()) {
9459 if (Reg == VReg && --NumUse == 0)
9460 return false;
9461 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9462 return true;
9463 }
9464 }
9465}
9466
9469 const DebugLoc &DL, Register Src, Register Dst) const {
9470 auto Cur = MBB.begin();
9471 if (Cur != MBB.end())
9472 do {
9473 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9474 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9475 ++Cur;
9476 } while (Cur != MBB.end() && Cur != LastPHIIt);
9477
9478 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9479 Dst);
9480}
9481
9484 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9485 if (InsPt != MBB.end() &&
9486 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9487 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9488 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9489 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9490 InsPt++;
9491 return BuildMI(MBB, InsPt, DL,
9492 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9493 : AMDGPU::S_MOV_B64_term),
9494 Dst)
9495 .addReg(Src, 0, SrcSubReg)
9496 .addReg(AMDGPU::EXEC, RegState::Implicit);
9497 }
9498 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9499 Dst);
9500}
9501
9502bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9503
9506 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9507 VirtRegMap *VRM) const {
9508 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9509 //
9510 // %0:sreg_32 = COPY $m0
9511 //
9512 // We explicitly chose SReg_32 for the virtual register so such a copy might
9513 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9514 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9515 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9516 // TargetInstrInfo::foldMemoryOperand() is going to try.
9517 // A similar issue also exists with spilling and reloading $exec registers.
9518 //
9519 // To prevent that, constrain the %0 register class here.
9520 if (isFullCopyInstr(MI)) {
9521 Register DstReg = MI.getOperand(0).getReg();
9522 Register SrcReg = MI.getOperand(1).getReg();
9523 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9524 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9526 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9527 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9528 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9529 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9530 return nullptr;
9531 }
9532 if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9533 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9534 return nullptr;
9535 }
9536 }
9537 }
9538
9539 return nullptr;
9540}
9541
9543 const MachineInstr &MI,
9544 unsigned *PredCost) const {
9545 if (MI.isBundle()) {
9547 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9548 unsigned Lat = 0, Count = 0;
9549 for (++I; I != E && I->isBundledWithPred(); ++I) {
9550 ++Count;
9551 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9552 }
9553 return Lat + Count - 1;
9554 }
9555
9556 return SchedModel.computeInstrLatency(&MI);
9557}
9558
9561 unsigned opcode = MI.getOpcode();
9562 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9563 auto IID = GI->getIntrinsicID();
9568
9569 switch (IID) {
9570 case Intrinsic::amdgcn_if:
9571 case Intrinsic::amdgcn_else:
9572 // FIXME: Uniform if second result
9573 break;
9574 }
9575
9577 }
9578
9579 // Loads from the private and flat address spaces are divergent, because
9580 // threads can execute the load instruction with the same inputs and get
9581 // different results.
9582 //
9583 // All other loads are not divergent, because if threads issue loads with the
9584 // same arguments, they will always get the same result.
9585 if (opcode == AMDGPU::G_LOAD) {
9586 if (MI.memoperands_empty())
9587 return InstructionUniformity::NeverUniform; // conservative assumption
9588
9589 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9590 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9591 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9592 })) {
9593 // At least one MMO in a non-global address space.
9595 }
9597 }
9598
9599 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9600 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9601 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9602 AMDGPU::isGenericAtomic(opcode)) {
9604 }
9606}
9607
9610
9611 if (isNeverUniform(MI))
9613
9614 unsigned opcode = MI.getOpcode();
9615 if (opcode == AMDGPU::V_READLANE_B32 ||
9616 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9617 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9619
9620 if (isCopyInstr(MI)) {
9621 const MachineOperand &srcOp = MI.getOperand(1);
9622 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9623 const TargetRegisterClass *regClass =
9624 RI.getPhysRegBaseClass(srcOp.getReg());
9627 }
9629 }
9630
9631 // GMIR handling
9632 if (MI.isPreISelOpcode())
9634
9635 // Atomics are divergent because they are executed sequentially: when an
9636 // atomic operation refers to the same address in each thread, then each
9637 // thread after the first sees the value written by the previous thread as
9638 // original value.
9639
9640 if (isAtomic(MI))
9642
9643 // Loads from the private and flat address spaces are divergent, because
9644 // threads can execute the load instruction with the same inputs and get
9645 // different results.
9646 if (isFLAT(MI) && MI.mayLoad()) {
9647 if (MI.memoperands_empty())
9648 return InstructionUniformity::NeverUniform; // conservative assumption
9649
9650 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9651 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9652 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9653 })) {
9654 // At least one MMO in a non-global address space.
9656 }
9657
9659 }
9660
9661 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9662 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9663
9664 // FIXME: It's conceptually broken to report this for an instruction, and not
9665 // a specific def operand. For inline asm in particular, there could be mixed
9666 // uniform and divergent results.
9667 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9668 const MachineOperand &SrcOp = MI.getOperand(I);
9669 if (!SrcOp.isReg())
9670 continue;
9671
9672 Register Reg = SrcOp.getReg();
9673 if (!Reg || !SrcOp.readsReg())
9674 continue;
9675
9676 // If RegBank is null, this is unassigned or an unallocatable special
9677 // register, which are all scalars.
9678 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9679 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9681 }
9682
9683 // TODO: Uniformity check conditions above can be rearranged for more
9684 // readability.
9685
9686 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9687 // currently turned into no-op COPYs by SelectionDAG ISel and are
9688 // therefore no longer recognizable.
9689
9691}
9692
9694 switch (MF.getFunction().getCallingConv()) {
9696 return 1;
9698 return 2;
9700 return 3;
9704 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9707 case CallingConv::C:
9708 case CallingConv::Fast:
9709 default:
9710 // Assume other calling conventions are various compute callable functions
9711 return 0;
9712 }
9713}
9714
9716 Register &SrcReg2, int64_t &CmpMask,
9717 int64_t &CmpValue) const {
9718 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9719 return false;
9720
9721 switch (MI.getOpcode()) {
9722 default:
9723 break;
9724 case AMDGPU::S_CMP_EQ_U32:
9725 case AMDGPU::S_CMP_EQ_I32:
9726 case AMDGPU::S_CMP_LG_U32:
9727 case AMDGPU::S_CMP_LG_I32:
9728 case AMDGPU::S_CMP_LT_U32:
9729 case AMDGPU::S_CMP_LT_I32:
9730 case AMDGPU::S_CMP_GT_U32:
9731 case AMDGPU::S_CMP_GT_I32:
9732 case AMDGPU::S_CMP_LE_U32:
9733 case AMDGPU::S_CMP_LE_I32:
9734 case AMDGPU::S_CMP_GE_U32:
9735 case AMDGPU::S_CMP_GE_I32:
9736 case AMDGPU::S_CMP_EQ_U64:
9737 case AMDGPU::S_CMP_LG_U64:
9738 SrcReg = MI.getOperand(0).getReg();
9739 if (MI.getOperand(1).isReg()) {
9740 if (MI.getOperand(1).getSubReg())
9741 return false;
9742 SrcReg2 = MI.getOperand(1).getReg();
9743 CmpValue = 0;
9744 } else if (MI.getOperand(1).isImm()) {
9745 SrcReg2 = Register();
9746 CmpValue = MI.getOperand(1).getImm();
9747 } else {
9748 return false;
9749 }
9750 CmpMask = ~0;
9751 return true;
9752 case AMDGPU::S_CMPK_EQ_U32:
9753 case AMDGPU::S_CMPK_EQ_I32:
9754 case AMDGPU::S_CMPK_LG_U32:
9755 case AMDGPU::S_CMPK_LG_I32:
9756 case AMDGPU::S_CMPK_LT_U32:
9757 case AMDGPU::S_CMPK_LT_I32:
9758 case AMDGPU::S_CMPK_GT_U32:
9759 case AMDGPU::S_CMPK_GT_I32:
9760 case AMDGPU::S_CMPK_LE_U32:
9761 case AMDGPU::S_CMPK_LE_I32:
9762 case AMDGPU::S_CMPK_GE_U32:
9763 case AMDGPU::S_CMPK_GE_I32:
9764 SrcReg = MI.getOperand(0).getReg();
9765 SrcReg2 = Register();
9766 CmpValue = MI.getOperand(1).getImm();
9767 CmpMask = ~0;
9768 return true;
9769 }
9770
9771 return false;
9772}
9773
9775 Register SrcReg2, int64_t CmpMask,
9776 int64_t CmpValue,
9777 const MachineRegisterInfo *MRI) const {
9778 if (!SrcReg || SrcReg.isPhysical())
9779 return false;
9780
9781 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9782 return false;
9783
9784 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9785 this](int64_t ExpectedValue, unsigned SrcSize,
9786 bool IsReversible, bool IsSigned) -> bool {
9787 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9788 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9789 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9790 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9791 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9792 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9793 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9794 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9795 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9796 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9797 //
9798 // Signed ge/gt are not used for the sign bit.
9799 //
9800 // If result of the AND is unused except in the compare:
9801 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9802 //
9803 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9804 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9805 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9806 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9807 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9808 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9809
9810 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9811 if (!Def || Def->getParent() != CmpInstr.getParent())
9812 return false;
9813
9814 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9815 Def->getOpcode() != AMDGPU::S_AND_B64)
9816 return false;
9817
9818 int64_t Mask;
9819 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9820 if (MO->isImm())
9821 Mask = MO->getImm();
9822 else if (!getFoldableImm(MO, Mask))
9823 return false;
9824 Mask &= maxUIntN(SrcSize);
9825 return isPowerOf2_64(Mask);
9826 };
9827
9828 MachineOperand *SrcOp = &Def->getOperand(1);
9829 if (isMask(SrcOp))
9830 SrcOp = &Def->getOperand(2);
9831 else if (isMask(&Def->getOperand(2)))
9832 SrcOp = &Def->getOperand(1);
9833 else
9834 return false;
9835
9836 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9837 if (IsSigned && BitNo == SrcSize - 1)
9838 return false;
9839
9840 ExpectedValue <<= BitNo;
9841
9842 bool IsReversedCC = false;
9843 if (CmpValue != ExpectedValue) {
9844 if (!IsReversible)
9845 return false;
9846 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9847 if (!IsReversedCC)
9848 return false;
9849 }
9850
9851 Register DefReg = Def->getOperand(0).getReg();
9852 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9853 return false;
9854
9855 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9856 I != E; ++I) {
9857 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9858 I->killsRegister(AMDGPU::SCC, &RI))
9859 return false;
9860 }
9861
9862 MachineOperand *SccDef =
9863 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9864 SccDef->setIsDead(false);
9865 CmpInstr.eraseFromParent();
9866
9867 if (!MRI->use_nodbg_empty(DefReg)) {
9868 assert(!IsReversedCC);
9869 return true;
9870 }
9871
9872 // Replace an AND whose result is unused with an S_BITCMP.
9873 MachineBasicBlock *MBB = Def->getParent();
9874
9875 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9876 : AMDGPU::S_BITCMP1_B32
9877 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9878 : AMDGPU::S_BITCMP1_B64;
9879
9880 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9881 .add(*SrcOp)
9882 .addImm(BitNo);
9883 Def->eraseFromParent();
9884
9885 return true;
9886 };
9887
9888 switch (CmpInstr.getOpcode()) {
9889 default:
9890 break;
9891 case AMDGPU::S_CMP_EQ_U32:
9892 case AMDGPU::S_CMP_EQ_I32:
9893 case AMDGPU::S_CMPK_EQ_U32:
9894 case AMDGPU::S_CMPK_EQ_I32:
9895 return optimizeCmpAnd(1, 32, true, false);
9896 case AMDGPU::S_CMP_GE_U32:
9897 case AMDGPU::S_CMPK_GE_U32:
9898 return optimizeCmpAnd(1, 32, false, false);
9899 case AMDGPU::S_CMP_GE_I32:
9900 case AMDGPU::S_CMPK_GE_I32:
9901 return optimizeCmpAnd(1, 32, false, true);
9902 case AMDGPU::S_CMP_EQ_U64:
9903 return optimizeCmpAnd(1, 64, true, false);
9904 case AMDGPU::S_CMP_LG_U32:
9905 case AMDGPU::S_CMP_LG_I32:
9906 case AMDGPU::S_CMPK_LG_U32:
9907 case AMDGPU::S_CMPK_LG_I32:
9908 return optimizeCmpAnd(0, 32, true, false);
9909 case AMDGPU::S_CMP_GT_U32:
9910 case AMDGPU::S_CMPK_GT_U32:
9911 return optimizeCmpAnd(0, 32, false, false);
9912 case AMDGPU::S_CMP_GT_I32:
9913 case AMDGPU::S_CMPK_GT_I32:
9914 return optimizeCmpAnd(0, 32, false, true);
9915 case AMDGPU::S_CMP_LG_U64:
9916 return optimizeCmpAnd(0, 64, true, false);
9917 }
9918
9919 return false;
9920}
9921
9923 unsigned OpName) const {
9924 if (!ST.needsAlignedVGPRs())
9925 return;
9926
9927 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9928 if (OpNo < 0)
9929 return;
9930 MachineOperand &Op = MI.getOperand(OpNo);
9931 if (getOpSize(MI, OpNo) > 4)
9932 return;
9933
9934 // Add implicit aligned super-reg to force alignment on the data operand.
9935 const DebugLoc &DL = MI.getDebugLoc();
9936 MachineBasicBlock *BB = MI.getParent();
9938 Register DataReg = Op.getReg();
9939 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9940 Register Undef = MRI.createVirtualRegister(
9941 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9942 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9943 Register NewVR =
9944 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9945 : &AMDGPU::VReg_64_Align2RegClass);
9946 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9947 .addReg(DataReg, 0, Op.getSubReg())
9948 .addImm(AMDGPU::sub0)
9949 .addReg(Undef)
9950 .addImm(AMDGPU::sub1);
9951 Op.setReg(NewVR);
9952 Op.setSubReg(AMDGPU::sub0);
9953 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9954}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
uint64_t IntrinsicInst * II
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:82
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static bool isRegOrFI(const MachineOperand &MO)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:73
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:78
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1520
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:281
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:745
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:749
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:999
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:391
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:627
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:765
bool hasMAIInsts() const
Definition: GCNSubtarget.h:815
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:278
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:298
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:761
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:680
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:753
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:344
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasNoF16PseudoScalarTransInlineConstants() const
Generation getGeneration() const
Definition: GCNSubtarget.h:317
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:924
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:732
bool hasAddr64() const
Definition: GCNSubtarget.h:381
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:724
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:537
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:607
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:617
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:193
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned getNumDefs() const
Return the number of MachineOperands that are register definitions.
Definition: MCInstrDesc.h:248
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:393
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:41
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
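A short sketch of how such a liveness query is typically used before clobbering a status register; MBB, an insertion point I, and the register info RI are assumed to be in scope:
// Only clobber SCC if it is provably dead in the neighborhood of I.
if (MBB.computeRegisterLiveness(&RI, AMDGPU::SCC, I, /*Neighborhood=*/16) ==
    MachineBasicBlock::LQR_Dead) {
  // Safe to emit an SCC-defining S_ADD/S_CMP at I.
}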
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
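The MachineInstrBuilder methods above chain onto BuildMI; a minimal sketch, with TII, MBB, an iterator I, a DebugLoc DL, and the destination/source registers all assumed:
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), DstSGPR)
    .addImm(0x3f800000);              // materialize 1.0f into an SGPR
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstVGPR)
    .addReg(SrcVGPR, RegState::Kill); // last use of SrcVGPR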
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:569
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:346
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.

Definition: MachineInstr.h:572
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:691
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:815
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:800
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:782
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:498
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:699
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:579
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:391
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
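A sketch of in-place operand rewriting in the style of immediate folding; MI, the operand index SrcIdx, and the known constant Imm are assumed placeholders:
MachineOperand &Src = MI.getOperand(SrcIdx);
if (Src.isReg() && !Src.isImplicit())
  Src.ChangeToImmediate(Imm);  // register use becomes an immediate operand
else if (Src.isImm())
  Src.setImm(Imm);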
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
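A sketch of backwards scavenging for a spare 64-bit SGPR, assuming RS is the RegScavenger and Restore is the MachineInstr that needs the register:
RS->enterBasicBlockEnd(MBB);
Register Scav = RS->scavengeRegisterBackwards(
    AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(Restore),
    /*RestoreAfter=*/false, /*SPAdj=*/0, /*AllowSpill=*/false);
if (Scav)
  RS->setRegUsed(Scav);  // mark it so later queries see the reuse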
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo on the given instruction opcode.
Definition: SIInstrInfo.h:1152
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
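A sketch of finishing the partially built add; ResultReg, ScaledReg, and Offset are assumed names, and the trailing immediate is the clamp bit expected by the VALU add encodings:
TII->getAddNoCarry(MBB, I, DL, ResultReg)
    .addImm(Offset)
    .addReg(ScaledReg, RegState::Kill)
    .addImm(0);  // clamp bit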
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1280
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
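A sketch of legalizing a global (FLAT) offset with these helpers, assuming TII and a raw byte offset COffset for a global load/store:
if (!TII->isLegalFLATOffset(COffset, AMDGPUAS::GLOBAL_ADDRESS,
                            SIInstrFlags::FlatGlobal)) {
  auto [ImmField, Remainder] = TII->splitFlatOffset(
      COffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
  // ImmField fits the instruction's offset field; Remainder still has to be
  // added to the base address register.
  (void)ImmField; (void)Remainder;
}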
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
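A sketch of the usual pairing of insertScratchExecCopy and restoreExec, assuming SaveExec is a spare wave-mask register (an SGPR pair on wave64) and SCC is known dead at the insertion point:
TII->insertScratchExecCopy(MF, MBB, I, DL, SaveExec, /*IsSCCLive=*/false);
// (whole-wave code goes here: all lanes stay enabled until the restore)
TII->restoreExec(MF, MBB, I, DL, SaveExec);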
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:965
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
static bool isF16PseudoScalarTrans(unsigned Opcode)
Definition: SIInstrInfo.h:949
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to another...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:1011
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change SADDR form of a FLAT Inst to its VADDR form if saddr operand was moved to VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
This function is used to determine if an instruction can be safely executed under EXEC = 0 without ha...
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
bool isBarrier(unsigned Opcode) const
Definition: SIInstrInfo.h:939
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1293
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:63
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:577
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:65
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:237
SlotIndexes pass.
Definition: SlotIndexes.h:297
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:531
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destination...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1571
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
bool isHi16Reg(MCRegister Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1572
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1574
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
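A sketch of looking up a named operand instead of hard-coding operand positions; MI is an assumed memory instruction that may or may not carry an offset operand:
if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::offset)) {
  int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::offset);
  int64_t ByteOffset = MI.getOperand(Idx).getImm();
  (void)ByteOffset;  // e.g. feed into offset legality checks
}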
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:452
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:454
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:451
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:453
@ TI_CONSTDATA_START
Definition: AMDGPU.h:450
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1573
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:138
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:480
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1462
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition: MathExtras.h:555
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:154
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:159
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
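A sketch of the common pattern of splitting a 64-bit immediate with Lo_32/Hi_32 and checking each half for inline-constant encodability; Imm and the subtarget ST are assumed names:
uint32_t Lo = Lo_32(Imm), Hi = Hi_32(Imm);
bool LoInline = AMDGPU::isInlinableLiteral32(int32_t(Lo), ST.hasInv2PiInlineImm());
bool HiInline = AMDGPU::isInlinableLiteral32(int32_t(Hi), ST.hasInv2PiInlineImm());
// Each half can then be moved into sub0/sub1 separately, taking the literal
// encoding only for a half that is not inlinable.
(void)LoInline; (void)HiInline;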
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:260
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1886
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:219
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:238
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:83
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.