LLVM 22.0.0git
SIPeepholeSDWA.cpp
Go to the documentation of this file.
1//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass tries to apply several peephole SDWA patterns.
10///
11/// E.g. original:
12/// V_LSHRREV_B32_e32 %0, 16, %1
13/// V_ADD_CO_U32_e32 %2, %0, %3
14/// V_LSHLREV_B32_e32 %4, 16, %2
15///
16/// Replace:
17/// V_ADD_CO_U32_sdwa %4, %1, %3
18/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19///
20//===----------------------------------------------------------------------===//
21
22#include "SIPeepholeSDWA.h"
23#include "AMDGPU.h"
24#include "GCNSubtarget.h"
26#include "llvm/ADT/MapVector.h"
27#include "llvm/ADT/Statistic.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "si-peephole-sdwa"
34
35STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
36STATISTIC(NumSDWAInstructionsPeepholed,
37 "Number of instruction converted to SDWA.");
38
39namespace {
40
41bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST,
42 const SIInstrInfo *TII);
43class SDWAOperand;
44class SDWADstOperand;
45
46using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
48
/// Implementation of the SDWA peephole optimization. Matches shift/mask
/// patterns in each basic block and folds them into SDWA forms of the
/// producing/consuming instructions.
class SIPeepholeSDWA {
private:
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  // Instructions that may become SDWA, with the operand patterns feeding them.
  SDWAOperandsMap PotentialMatches;
  // Instructions already rewritten to SDWA form in this function.
  SmallVector<MachineInstr *, 8> ConvertedInstructions;

  // If \p Op is an immediate, or a foldable copy of one, return its value.
  std::optional<int64_t> foldToImm(const MachineOperand &Op) const;

  // Scan \p MBB and record every instruction matching an SDWA pattern.
  void matchSDWAOperands(MachineBasicBlock &MBB);
  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
  void pseudoOpConvertToVOP2(MachineInstr &MI,
                             const GCNSubtarget &ST) const;
  void convertVcndmaskToVOP2(MachineInstr &MI, const GCNSubtarget &ST) const;
  MachineInstr *createSDWAVersion(MachineInstr &MI);
  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
  void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;

public:
  // Entry point shared by the legacy and new pass manager wrappers.
  bool run(MachineFunction &MF);
};
73
/// Legacy pass manager wrapper around SIPeepholeSDWA.
class SIPeepholeSDWALegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPeepholeSDWALegacy() : MachineFunctionPass(ID) {}

  StringRef getPassName() const override { return "SI Peephole SDWA"; }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // This peephole rewrites instructions in place; it never alters the CFG.
    AU.setPreservesCFG();
  }
};
89
90using namespace AMDGPU::SDWA;
91
/// Abstract base for a matched SDWA pattern. A pattern relates two operands:
/// the operand that will appear in the converted SDWA instruction (Target)
/// and the operand it replaces (Replaced).
class SDWAOperand {
private:
  MachineOperand *Target; // Operand that would be used in converted instruction
  MachineOperand *Replaced; // Operand that would be replaced by Target

  /// Returns true iff the SDWA selection of this SDWAOperand can be combined
  /// with the SDWA selections of its uses in \p MI.
  virtual bool canCombineSelections(const MachineInstr &MI,
                                    const SIInstrInfo *TII) = 0;

public:
  // Both operands must be register operands (checked eagerly).
  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
      : Target(TargetOp), Replaced(ReplacedOp) {
    assert(Target->isReg());
    assert(Replaced->isReg());
  }

  virtual ~SDWAOperand() = default;

  /// Return the instruction that could be rewritten to SDWA form for this
  /// pattern, or nullptr. With \p PotentialMatches set, implementations may
  /// instead record one entry per use in the map.
  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                           const GCNSubtarget &ST,
                                           SDWAOperandsMap *PotentialMatches = nullptr) = 0;
  /// Apply this pattern to the SDWA instruction \p MI; returns false if the
  /// conversion turns out to be illegal.
  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;

  MachineOperand *getTargetOperand() const { return Target; }
  MachineOperand *getReplacedOperand() const { return Replaced; }
  MachineInstr *getParentInst() const { return Target->getParent(); }

  // MachineOperand -> MachineInstr -> MachineBasicBlock -> MachineFunction.
  MachineRegisterInfo *getMRI() const {
    return &getParentInst()->getParent()->getParent()->getRegInfo();
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  virtual void print(raw_ostream& OS) const = 0;
  void dump() const { print(dbgs()); }
#endif
};
129
/// An SDWA pattern expressible as a *source* selection: the consumer of
/// Replaced can read a byte/word of Target directly via src_sel, optionally
/// with abs/neg (float) or sext (integer) modifiers.
class SDWASrcOperand : public SDWAOperand {
private:
  SdwaSel SrcSel; // Which byte/word of the source register to read.
  bool Abs;       // Apply the |x| float modifier.
  bool Neg;       // Apply the -x float modifier.
  bool Sext;      // Sign-extend the selected bits (integer modifier).

public:
  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
                 bool Sext_ = false)
      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
        Neg(Neg_), Sext(Sext_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getSrcSel() const { return SrcSel; }
  bool getAbs() const { return Abs; }
  bool getNeg() const { return Neg; }
  bool getSext() const { return Sext; }

  // Compute the src_modifiers immediate to install alongside SrcOp.
  uint64_t getSrcMods(const SIInstrInfo *TII,
                      const MachineOperand *SrcOp) const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
163
/// An SDWA pattern expressible as a *destination* selection: the producer of
/// Replaced can write the byte/word of Target directly via dst_sel, with the
/// unused bits handled according to DstUn.
class SDWADstOperand : public SDWAOperand {
private:
  SdwaSel DstSel;  // Which byte/word of the destination register is written.
  DstUnused DstUn; // Policy for the bits not covered by DstSel.

public:
  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}

  MachineInstr *potentialToConvert(const SIInstrInfo *TII,
                                   const GCNSubtarget &ST,
                                   SDWAOperandsMap *PotentialMatches = nullptr) override;
  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  SdwaSel getDstSel() const { return DstSel; }
  DstUnused getDstUnused() const { return DstUn; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
188
/// A destination pattern whose unused destination bits are preserved from
/// another register (dst_unused:UNUSED_PRESERVE), matched from v_or_b32 of
/// an SDWA result with another value.
class SDWADstPreserveOperand : public SDWADstOperand {
private:
  MachineOperand *Preserve; // Register supplying the preserved bits.

public:
  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
                         MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
      : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
        Preserve(PreserveOp) {}

  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
  bool canCombineSelections(const MachineInstr &MI,
                            const SIInstrInfo *TII) override;

  MachineOperand *getPreservedOperand() const { return Preserve; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  void print(raw_ostream& OS) const override;
#endif
};
209
210} // end anonymous namespace
211
212INITIALIZE_PASS(SIPeepholeSDWALegacy, DEBUG_TYPE, "SI Peephole SDWA", false,
213 false)
214
215char SIPeepholeSDWALegacy::ID = 0;
216
217char &llvm::SIPeepholeSDWALegacyID = SIPeepholeSDWALegacy::ID;
218
220 return new SIPeepholeSDWALegacy();
221}
222
223#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
225 switch(Sel) {
226 case BYTE_0: OS << "BYTE_0"; break;
227 case BYTE_1: OS << "BYTE_1"; break;
228 case BYTE_2: OS << "BYTE_2"; break;
229 case BYTE_3: OS << "BYTE_3"; break;
230 case WORD_0: OS << "WORD_0"; break;
231 case WORD_1: OS << "WORD_1"; break;
232 case DWORD: OS << "DWORD"; break;
233 }
234 return OS;
235}
236
238 switch(Un) {
239 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
240 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
241 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
242 }
243 return OS;
244}
245
247void SDWASrcOperand::print(raw_ostream& OS) const {
248 OS << "SDWA src: " << *getTargetOperand()
249 << " src_sel:" << getSrcSel()
250 << " abs:" << getAbs() << " neg:" << getNeg()
251 << " sext:" << getSext() << '\n';
252}
253
255void SDWADstOperand::print(raw_ostream& OS) const {
256 OS << "SDWA dst: " << *getTargetOperand()
257 << " dst_sel:" << getDstSel()
258 << " dst_unused:" << getDstUnused() << '\n';
259}
260
262void SDWADstPreserveOperand::print(raw_ostream& OS) const {
263 OS << "SDWA preserve dst: " << *getTargetOperand()
264 << " dst_sel:" << getDstSel()
265 << " preserve:" << *getPreservedOperand() << '\n';
266}
267
268#endif
269
270static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
271 assert(To.isReg() && From.isReg());
272 To.setReg(From.getReg());
273 To.setSubReg(From.getSubReg());
274 To.setIsUndef(From.isUndef());
275 if (To.isUse()) {
276 To.setIsKill(From.isKill());
277 } else {
278 To.setIsDead(From.isDead());
279 }
280}
281
282static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
283 return LHS.isReg() &&
284 RHS.isReg() &&
285 LHS.getReg() == RHS.getReg() &&
286 LHS.getSubReg() == RHS.getSubReg();
287}
288
290 const MachineRegisterInfo *MRI) {
291 if (!Reg->isReg() || !Reg->isDef())
292 return nullptr;
293
294 return MRI->getOneNonDBGUse(Reg->getReg());
295}
296
298 const MachineRegisterInfo *MRI) {
299 if (!Reg->isReg())
300 return nullptr;
301
302 return MRI->getOneDef(Reg->getReg());
303}
304
305/// Combine an SDWA instruction's existing SDWA selection \p Sel with
306/// the SDWA selection \p OperandSel of its operand. If the selections
307/// are compatible, return the combined selection, otherwise return a
308/// nullopt.
309/// For example, if we have Sel = BYTE_0 Sel and OperandSel = WORD_1:
310/// BYTE_0 Sel (WORD_1 Sel (%X)) -> BYTE_2 Sel (%X)
311static std::optional<SdwaSel> combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel) {
312 if (Sel == SdwaSel::DWORD)
313 return OperandSel;
314
315 if (Sel == OperandSel || OperandSel == SdwaSel::DWORD)
316 return Sel;
317
318 if (Sel == SdwaSel::WORD_1 || Sel == SdwaSel::BYTE_2 ||
319 Sel == SdwaSel::BYTE_3)
320 return {};
321
322 if (OperandSel == SdwaSel::WORD_0)
323 return Sel;
324
325 if (OperandSel == SdwaSel::WORD_1) {
326 if (Sel == SdwaSel::BYTE_0)
327 return SdwaSel::BYTE_2;
328 if (Sel == SdwaSel::BYTE_1)
329 return SdwaSel::BYTE_3;
330 if (Sel == SdwaSel::WORD_0)
331 return SdwaSel::WORD_1;
332 }
333
334 return {};
335}
336
/// Build the src_modifiers immediate for \p SrcOp: start from the modifiers
/// already present on the matching source slot of SrcOp's instruction, then
/// fold in this operand's Abs/Neg/Sext flags.
uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
                                    const MachineOperand *SrcOp) const {
  uint64_t Mods = 0;
  const auto *MI = SrcOp->getParent();
  // Pick up existing modifiers from whichever slot (src0/src1) SrcOp is.
  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
      Mods = Mod->getImm();
    }
  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
    if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
      Mods = Mod->getImm();
    }
  }
  if (Abs || Neg) {
    assert(!Sext &&
           "Float and integer src modifiers can't be set simultaneously");
    Mods |= Abs ? SISrcMods::ABS : 0u;
    // XOR rather than OR: a NEG already present in the existing modifiers
    // and this operand's NEG cancel each other out.
    Mods ^= Neg ? SISrcMods::NEG : 0u;
  } else if (Sext) {
    Mods |= SISrcMods::SEXT;
  }

  return Mods;
}
361
/// Without \p PotentialMatches: return the single user of the replaced
/// register if its selection is combinable, else nullptr. With
/// \p PotentialMatches: require *every* user to be SDWA-convertible and
/// combinable, record this operand against each user in the map, and
/// return nullptr.
MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII,
                                                 const GCNSubtarget &ST,
                                                 SDWAOperandsMap *PotentialMatches) {
  if (PotentialMatches != nullptr) {
    // Fill out the map for all uses if all can be converted
    MachineOperand *Reg = getReplacedOperand();
    if (!Reg->isReg() || !Reg->isDef())
      return nullptr;

    for (MachineInstr &UseMI : getMRI()->use_nodbg_instructions(Reg->getReg()))
      // Check that all instructions that use Reg can be converted
      if (!isConvertibleToSDWA(UseMI, ST, TII) ||
          !canCombineSelections(UseMI, TII))
        return nullptr;

    // Now that it's guaranteed all uses are legal, iterate over the uses again
    // to add them for later conversion.
    for (MachineOperand &UseMO : getMRI()->use_nodbg_operands(Reg->getReg())) {
      // Should not get a subregister here
      assert(isSameReg(UseMO, *Reg));

      SDWAOperandsMap &potentialMatchesMap = *PotentialMatches;
      MachineInstr *UseMI = UseMO.getParent();
      potentialMatchesMap[UseMI].push_back(this);
    }
    return nullptr;
  }

  // For SDWA src operand potential instruction is one that use register
  // defined by parent instruction
  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
  if (!PotentialMO)
    return nullptr;

  MachineInstr *Parent = PotentialMO->getParent();

  return canCombineSelections(*Parent, TII) ? Parent : nullptr;
}
400
/// Install this source pattern into the SDWA instruction \p MI: find the
/// source slot (src0/src1, or the tied UNUSED_PRESERVE slot) that reads the
/// replaced register, redirect it to the target register, and update the
/// matching src_sel/src_modifiers. Returns false if the conversion is not
/// legal for \p MI.
bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  switch (MI.getOpcode()) {
  case AMDGPU::V_CVT_F32_FP8_sdwa:
  case AMDGPU::V_CVT_F32_BF8_sdwa:
  case AMDGPU::V_CVT_PK_F32_FP8_sdwa:
  case AMDGPU::V_CVT_PK_F32_BF8_sdwa:
    // Does not support input modifiers: noabs, noneg, nosext.
    return false;
  case AMDGPU::V_CNDMASK_B32_sdwa:
    // SISrcMods uses the same bitmask for SEXT and NEG modifiers and
    // hence the compiler can only support one type of modifier for
    // each SDWA instruction. For V_CNDMASK_B32_sdwa, this is NEG
    // since its operands get printed using
    // AMDGPUInstPrinter::printOperandAndFPInputMods which produces
    // the output intended for NEG if SEXT is set.
    //
    // The ISA does actually support both modifiers on most SDWA
    // instructions.
    //
    // FIXME Accept SEXT here after fixing this issue.
    if (Sext)
      return false;
    break;
  }

  // Find operand in instruction that matches source operand and replace it with
  // target operand. Set corresponding src_sel
  bool IsPreserveSrc = false;
  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  MachineOperand *SrcMods =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
  assert(Src && (Src->isReg() || Src->isImm()));
  if (!isSameReg(*Src, *getReplacedOperand())) {
    // If this is not src0 then it could be src1
    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);

    if (!Src ||
        !isSameReg(*Src, *getReplacedOperand())) {
      // It's possible this Src is a tied operand for
      // UNUSED_PRESERVE, in which case we can either
      // abandon the peephole attempt, or if legal we can
      // copy the target operand into the tied slot
      // if the preserve operation will effectively cause the same
      // result by overwriting the rest of the dst.
      MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
      MachineOperand *DstUnused =
          TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);

      if (Dst &&
          DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
        // This will work if the tied src is accessing WORD_0, and the dst is
        // writing WORD_1. Modifiers don't matter because all the bits that
        // would be impacted are being overwritten by the dst.
        // Any other case will not work.
        SdwaSel DstSel = static_cast<SdwaSel>(
            TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
        if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
            getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
          IsPreserveSrc = true;
          auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                   AMDGPU::OpName::vdst);
          auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
          Src = &MI.getOperand(TiedIdx);
          // Tied slot has no sel/modifiers operands of its own.
          SrcSel = nullptr;
          SrcMods = nullptr;
        } else {
          // Not legal to convert this src
          return false;
        }
      }
    }
    assert(Src && Src->isReg());

    if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
        !isSameReg(*Src, *getReplacedOperand())) {
      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
      // src2. This is not allowed.
      return false;
    }

    assert(isSameReg(*Src, *getReplacedOperand()) &&
           (IsPreserveSrc || (SrcSel && SrcMods)));
  }
  copyRegOperand(*Src, *getTargetOperand());
  if (!IsPreserveSrc) {
    // Fold our selection into any selection MI already applies. The
    // combination was validated by canCombineSelections during matching.
    SdwaSel ExistingSel = static_cast<SdwaSel>(SrcSel->getImm());
    SrcSel->setImm(*combineSdwaSel(ExistingSel, getSrcSel()));
    SrcMods->setImm(getSrcMods(TII, Src));
  }
  getTargetOperand()->setIsKill(false);
  return true;
}
499
500/// Verify that the SDWA selection operand \p SrcSelOpName of the SDWA
501/// instruction \p MI can be combined with the selection \p OpSel.
502static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
503 AMDGPU::OpName SrcSelOpName, SdwaSel OpSel) {
504 assert(TII->isSDWA(MI.getOpcode()));
505
506 const MachineOperand *SrcSelOp = TII->getNamedOperand(MI, SrcSelOpName);
507 SdwaSel SrcSel = static_cast<SdwaSel>(SrcSelOp->getImm());
508
509 return combineSdwaSel(SrcSel, OpSel).has_value();
510}
511
512/// Verify that \p Op is the same register as the operand of the SDWA
513/// instruction \p MI named by \p SrcOpName and that the SDWA
514/// selection \p SrcSelOpName can be combined with the \p OpSel.
515static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII,
516 AMDGPU::OpName SrcOpName,
517 AMDGPU::OpName SrcSelOpName, MachineOperand *Op,
518 SdwaSel OpSel) {
519 assert(TII->isSDWA(MI.getOpcode()));
520
521 const MachineOperand *Src = TII->getNamedOperand(MI, SrcOpName);
522 if (!Src || !isSameReg(*Src, *Op))
523 return true;
524
525 return canCombineOpSel(MI, TII, SrcSelOpName, OpSel);
526}
527
528bool SDWASrcOperand::canCombineSelections(const MachineInstr &MI,
529 const SIInstrInfo *TII) {
530 if (!TII->isSDWA(MI.getOpcode()))
531 return true;
532
533 using namespace AMDGPU;
534
535 return canCombineOpSel(MI, TII, OpName::src0, OpName::src0_sel,
536 getReplacedOperand(), getSrcSel()) &&
537 canCombineOpSel(MI, TII, OpName::src1, OpName::src1_sel,
538 getReplacedOperand(), getSrcSel());
539}
540
541MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII,
542 const GCNSubtarget &ST,
543 SDWAOperandsMap *PotentialMatches) {
544 // For SDWA dst operand potential instruction is one that defines register
545 // that this operand uses
546 MachineRegisterInfo *MRI = getMRI();
547 MachineInstr *ParentMI = getParentInst();
548
549 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
550 if (!PotentialMO)
551 return nullptr;
552
553 // Check that ParentMI is the only instruction that uses replaced register
554 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
555 if (&UseInst != ParentMI)
556 return nullptr;
557 }
558
559 MachineInstr *Parent = PotentialMO->getParent();
560 return canCombineSelections(*Parent, TII) ? Parent : nullptr;
561}
562
/// Rewrite the SDWA instruction \p MI to write directly into this pattern's
/// target register with the combined dst_sel and this pattern's dst_unused,
/// then erase the now-redundant parent instruction.
bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused

  if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
      getDstSel() != AMDGPU::SDWA::DWORD) {
    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
    return false;
  }

  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  assert(Operand &&
         Operand->isReg() &&
         isSameReg(*Operand, *getReplacedOperand()));
  copyRegOperand(*Operand, *getTargetOperand());
  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
  assert(DstSel);

  // Fold our selection into any dst_sel MI already has; presumably guaranteed
  // to succeed by the canCombineSelections check during matching.
  SdwaSel ExistingSel = static_cast<SdwaSel>(DstSel->getImm());
  DstSel->setImm(combineSdwaSel(ExistingSel, getDstSel()).value());

  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  DstUnused->setImm(getDstUnused());

  // Remove original instruction because it would conflict with our new
  // instruction by register definition
  getParentInst()->eraseFromParent();
  return true;
}
595
596bool SDWADstOperand::canCombineSelections(const MachineInstr &MI,
597 const SIInstrInfo *TII) {
598 if (!TII->isSDWA(MI.getOpcode()))
599 return true;
600
601 return canCombineOpSel(MI, TII, AMDGPU::OpName::dst_sel, getDstSel());
602}
603
/// Convert for the UNUSED_PRESERVE case: move \p MI directly before the
/// v_or_b32 (the parent instruction), tie the preserved register to vdst as
/// an implicit use, then finish via the base-class conversion (which also
/// erases the v_or_b32).
bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
                                           const SIInstrInfo *TII) {
  // MI should be moved right before v_or_b32.
  // For this we should clear all kill flags on uses of MI src-operands or else
  // we can encounter problem with use of killed operand.
  for (MachineOperand &MO : MI.uses()) {
    if (!MO.isReg())
      continue;
    getMRI()->clearKillFlags(MO.getReg());
  }

  // Move MI before v_or_b32
  MI.getParent()->remove(&MI);
  getParentInst()->getParent()->insert(getParentInst(), &MI);

  // Add Implicit use of preserved register
  MachineInstrBuilder MIB(*MI.getMF(), MI);
  MIB.addReg(getPreservedOperand()->getReg(),
             // NOTE(review): the subregister index appears to be passed in
             // addReg's flags parameter here; upstream LLVM passes
             // RegState::Implicit before the subregister -- verify against
             // the original source (a line may be missing from this copy).
             getPreservedOperand()->getSubReg());

  // Tie dst to implicit use
  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
                 MI.getNumOperands() - 1);

  // Convert MI as any other SDWADstOperand and remove v_or_b32
  return SDWADstOperand::convertToSDWA(MI, TII);
}
632
/// The preserve variant adds no constraints beyond the base-class dst_sel
/// compatibility check.
bool SDWADstPreserveOperand::canCombineSelections(const MachineInstr &MI,
                                                  const SIInstrInfo *TII) {
  return SDWADstOperand::canCombineSelections(MI, TII);
}
637
638std::optional<int64_t>
639SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
640 if (Op.isImm()) {
641 return Op.getImm();
642 }
643
644 // If this is not immediate then it can be copy of immediate value, e.g.:
645 // %1 = S_MOV_B32 255;
646 if (Op.isReg()) {
647 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
648 if (!isSameReg(Op, Def))
649 continue;
650
651 const MachineInstr *DefInst = Def.getParent();
652 if (!TII->isFoldableCopy(*DefInst))
653 return std::nullopt;
654
655 const MachineOperand &Copied = DefInst->getOperand(1);
656 if (!Copied.isImm())
657 return std::nullopt;
658
659 return Copied.getImm();
660 }
661 }
662
663 return std::nullopt;
664}
665
/// Try to recognize \p MI as an instruction whose effect can be expressed as
/// an SDWA selection: 32-bit shifts by 16/24, 16-bit shifts by 8, BFE with a
/// byte/word-aligned field, AND with a 0xff/0xffff mask, or a v_or_b32 that
/// merges an SDWA result with another SDWA result (UNUSED_PRESERVE pattern).
/// Returns the matching SDWAOperand, or nullptr if nothing matches.
std::unique_ptr<SDWAOperand>
SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  switch (Opcode) {
  case AMDGPU::V_LSHRREV_B32_e32:
  case AMDGPU::V_ASHRREV_I32_e32:
  case AMDGPU::V_LSHLREV_B32_e32:
  case AMDGPU::V_LSHRREV_B32_e64:
  case AMDGPU::V_ASHRREV_I32_e64:
  case AMDGPU::V_LSHLREV_B32_e64: {
    // from: v_lshrrev_b32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3

    // from: v_ashrrev_i32_e32 v1, 16/24, v0
    // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1

    // from: v_lshlrev_b32_e32 v1, 16/24, v0
    // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm)
      break;

    if (*Imm != 16 && *Imm != 24)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    // Only virtual registers are tracked by this peephole.
    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B32_e64) {
      return std::make_unique<SDWADstOperand>(
          Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
    }
    // Arithmetic (not logical) right shifts request sign extension.
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
        Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B32_e64);
    break;
  }

  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_opsel_e64:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_LSHLREV_B16_opsel_e64:
  case AMDGPU::V_LSHLREV_B16_e64: {
    // from: v_lshrrev_b16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1

    // from: v_ashrrev_i16_e32 v1, 8, v0
    // to SDWA src:v0 src_sel:BYTE_1 sext:1

    // from: v_lshlrev_b16_e32 v1, 8, v0
    // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    auto Imm = foldToImm(*Src0);
    if (!Imm || *Imm != 8)
      break;

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src1->isReg() || Src1->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
        Opcode == AMDGPU::V_LSHLREV_B16_opsel_e64 ||
        Opcode == AMDGPU::V_LSHLREV_B16_e64)
      return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
    // Arithmetic right shifts request sign extension.
    return std::make_unique<SDWASrcOperand>(
        Src1, Dst, BYTE_1, false, false,
        Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
        Opcode != AMDGPU::V_LSHRREV_B16_opsel_e64 &&
        Opcode != AMDGPU::V_LSHRREV_B16_e64);
    break;
  }

  case AMDGPU::V_BFE_I32_e64:
  case AMDGPU::V_BFE_U32_e64: {
    // e.g.:
    // from: v_bfe_u32 v1, v0, 8, 8
    // to SDWA src:v0 src_sel:BYTE_1

    // offset | width | src_sel
    // ------------------------
    // 0      | 8     | BYTE_0
    // 0      | 16    | WORD_0
    // 0      | 32    | DWORD ?
    // 8      | 8     | BYTE_1
    // 16     | 8     | BYTE_2
    // 16     | 16    | WORD_1
    // 24     | 8     | BYTE_3

    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto Offset = foldToImm(*Src1);
    if (!Offset)
      break;

    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    auto Width = foldToImm(*Src2);
    if (!Width)
      break;

    SdwaSel SrcSel = DWORD;

    if (*Offset == 0 && *Width == 8)
      SrcSel = BYTE_0;
    else if (*Offset == 0 && *Width == 16)
      SrcSel = WORD_0;
    else if (*Offset == 0 && *Width == 32)
      SrcSel = DWORD;
    else if (*Offset == 8 && *Width == 8)
      SrcSel = BYTE_1;
    else if (*Offset == 16 && *Width == 8)
      SrcSel = BYTE_2;
    else if (*Offset == 16 && *Width == 16)
      SrcSel = WORD_1;
    else if (*Offset == 24 && *Width == 8)
      SrcSel = BYTE_3;
    else
      break;

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!Src0->isReg() || Src0->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    // Signed BFE requests sign extension of the selected field.
    return std::make_unique<SDWASrcOperand>(
        Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
  }

  case AMDGPU::V_AND_B32_e32:
  case AMDGPU::V_AND_B32_e64: {
    // e.g.:
    // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
    // to SDWA src:v0 src_sel:WORD_0/BYTE_0

    MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    auto *ValSrc = Src1;
    auto Imm = foldToImm(*Src0);

    // The mask may be in either source slot; ValSrc is the other one.
    if (!Imm) {
      Imm = foldToImm(*Src1);
      ValSrc = Src0;
    }

    if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
      break;

    MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);

    if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
        Dst->getReg().isPhysical())
      break;

    return std::make_unique<SDWASrcOperand>(
        ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
  }

  case AMDGPU::V_OR_B32_e32:
  case AMDGPU::V_OR_B32_e64: {
    // Patterns for dst_unused:UNUSED_PRESERVE.
    // e.g., from:
    // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
    //                           src1_sel:WORD_1 src2_sel:WORD1
    // v_add_f16_e32 v3, v1, v2
    // v_or_b32_e32 v4, v0, v3
    // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3

    // Check if one of operands of v_or_b32 is SDWA instruction
    using CheckRetType =
        std::optional<std::pair<MachineOperand *, MachineOperand *>>;
    auto CheckOROperandsForSDWA =
        [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
          if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
            return CheckRetType(std::nullopt);

          MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
          if (!Op1Def)
            return CheckRetType(std::nullopt);

          MachineInstr *Op1Inst = Op1Def->getParent();
          if (!TII->isSDWA(*Op1Inst))
            return CheckRetType(std::nullopt);

          MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
          if (!Op2Def)
            return CheckRetType(std::nullopt);

          return CheckRetType(std::pair(Op1Def, Op2Def));
        };

    MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
    MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
    assert(OrSDWA && OrOther);
    auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
    if (!Res) {
      // Try the operands in the opposite order.
      OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
      OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
      assert(OrSDWA && OrOther);
      Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
      if (!Res)
        break;
    }

    MachineOperand *OrSDWADef = Res->first;
    MachineOperand *OrOtherDef = Res->second;
    assert(OrSDWADef && OrOtherDef);

    MachineInstr *SDWAInst = OrSDWADef->getParent();
    MachineInstr *OtherInst = OrOtherDef->getParent();

    // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
    // destination patterns don't overlap. Compatible instruction can be either
    // regular instruction with compatible bitness or SDWA instruction with
    // correct dst_sel
    // SDWAInst | OtherInst bitness / OtherInst dst_sel
    // -----------------------------------------------------
    // DWORD    | no                    / no
    // WORD_0   | no                    / BYTE_2/3, WORD_1
    // WORD_1   | 8/16-bit instructions / BYTE_0/1, WORD_0
    // BYTE_0   | no                    / BYTE_1/2/3, WORD_1
    // BYTE_1   | 8-bit                 / BYTE_0/2/3, WORD_1
    // BYTE_2   | 8/16-bit              / BYTE_0/1/3. WORD_0
    // BYTE_3   | 8/16/24-bit           / BYTE_0/1/2, WORD_0
    // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
    // but v_add_f32 is not.

    // TODO: add support for non-SDWA instructions as OtherInst.
    // For now this only works with SDWA instructions. For regular instructions
    // there is no way to determine if the instruction writes only 8/16/24-bit
    // out of full register size and all registers are at min 32-bit wide.
    if (!TII->isSDWA(*OtherInst))
      break;

    SdwaSel DstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
    SdwaSel OtherDstSel = static_cast<SdwaSel>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));

    bool DstSelAgree = false;
    switch (DstSel) {
    case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_1));
      break;
    case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_3) ||
                                (OtherDstSel == WORD_0));
      break;
    case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
                                (OtherDstSel == BYTE_1) ||
                                (OtherDstSel == BYTE_2) ||
                                (OtherDstSel == WORD_0));
      break;
    default: DstSelAgree = false;
    }

    if (!DstSelAgree)
      break;

    // Also OtherInst dst_unused should be UNUSED_PAD
    DstUnused OtherDstUnused = static_cast<DstUnused>(
        TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
    if (OtherDstUnused != DstUnused::UNUSED_PAD)
      break;

    // Create DstPreserveOperand
    MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
    assert(OrDst && OrDst->isReg());

    return std::make_unique<SDWADstPreserveOperand>(
        OrDst, OrSDWADef, OrOtherDef, DstSel);

  }
  }

  return std::unique_ptr<SDWAOperand>(nullptr);
}
970
971#if !defined(NDEBUG)
972static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
973 Operand.print(OS);
974 return OS;
975}
976#endif
977
978void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
979 for (MachineInstr &MI : MBB) {
980 if (auto Operand = matchSDWAOperand(MI)) {
981 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
982 SDWAOperands[&MI] = std::move(Operand);
983 ++NumSDWAPatternsFound;
984 }
985 }
986}
987
988// Convert the V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows
989// isConvertibleToSDWA to perform its transformation on V_ADD_CO_U32_e32 into
990// V_ADD_CO_U32_sdwa.
991//
992// We are transforming from a VOP3 into a VOP2 form of the instruction.
993// %19:vgpr_32 = V_AND_B32_e32 255,
994// killed %16:vgpr_32, implicit $exec
995// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
996// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
997// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
998// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
999//
1000// becomes
1001// %47:vgpr_32 = V_ADD_CO_U32_sdwa
1002// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
1003// implicit-def $vcc, implicit $exec
1004// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
1005// %26.sub1:vreg_64, %54:vgpr_32, killed $vcc, implicit $exec
1006void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
1007 const GCNSubtarget &ST) const {
1008 int Opc = MI.getOpcode();
1009 assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
1010 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
1011
1012 // Can the candidate MI be shrunk?
1013 if (!TII->canShrink(MI, *MRI))
1014 return;
1016 // Find the related ADD instruction.
1017 const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1018 if (!Sdst)
1019 return;
1020 MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
1021 if (!NextOp)
1022 return;
1023 MachineInstr &MISucc = *NextOp->getParent();
1024
1025 // Make sure the carry in/out are subsequently unused.
1026 MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
1027 if (!CarryIn)
1028 return;
1029 MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
1030 if (!CarryOut)
1031 return;
1032 if (!MRI->hasOneNonDBGUse(CarryIn->getReg()) ||
1033 !MRI->use_nodbg_empty(CarryOut->getReg()))
1034 return;
1035 // Make sure VCC or its subregs are dead before MI.
1036 MachineBasicBlock &MBB = *MI.getParent();
1038 MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
1039 if (Liveness != MachineBasicBlock::LQR_Dead)
1040 return;
1041 // Check if VCC is referenced in range of (MI,MISucc].
1042 for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
1043 I != E; ++I) {
1044 if (I->modifiesRegister(AMDGPU::VCC, TRI))
1045 return;
1046 }
1047
1048 // Replace MI with V_{SUB|ADD}_I32_e32
1049 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
1050 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1051 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1052 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1053 .setMIFlags(MI.getFlags());
1054
1055 MI.eraseFromParent();
1056
1057 // Since the carry output of MI is now VCC, update its use in MISucc.
1058
1059 MISucc.substituteRegister(CarryIn->getReg(), TRI->getVCC(), 0, *TRI);
1060}
1061
1062/// Try to convert an \p MI in VOP3 which takes an src2 carry-in
1063/// operand into the corresponding VOP2 form which expects the
1064/// argument in VCC. To this end, add an copy from the carry-in to
1065/// VCC. The conversion will only be applied if \p MI can be shrunk
1066/// to VOP2 and if VCC can be proven to be dead before \p MI.
1067void SIPeepholeSDWA::convertVcndmaskToVOP2(MachineInstr &MI,
1068 const GCNSubtarget &ST) const {
1069 assert(MI.getOpcode() == AMDGPU::V_CNDMASK_B32_e64);
1070
1071 LLVM_DEBUG(dbgs() << "Attempting VOP2 conversion: " << MI);
1072 if (!TII->canShrink(MI, *MRI)) {
1073 LLVM_DEBUG(dbgs() << "Cannot shrink instruction\n");
1074 return;
1075 }
1076
1077 const MachineOperand &CarryIn =
1078 *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1079 Register CarryReg = CarryIn.getReg();
1080 MachineInstr *CarryDef = MRI->getVRegDef(CarryReg);
1081 if (!CarryDef) {
1082 LLVM_DEBUG(dbgs() << "Missing carry-in operand definition\n");
1083 return;
1084 }
1085
1086 // Make sure VCC or its subregs are dead before MI.
1087 MCRegister Vcc = TRI->getVCC();
1088 MachineBasicBlock &MBB = *MI.getParent();
1091 if (Liveness != MachineBasicBlock::LQR_Dead) {
1092 LLVM_DEBUG(dbgs() << "VCC not known to be dead before instruction\n");
1093 return;
1094 }
1095
1096 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY), Vcc).add(CarryIn);
1097
1098 auto Converted = BuildMI(MBB, MI, MI.getDebugLoc(),
1099 TII->get(AMDGPU::getVOPe32(MI.getOpcode())))
1100 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
1101 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
1102 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
1103 .setMIFlags(MI.getFlags());
1104 TII->fixImplicitOperands(*Converted);
1105 LLVM_DEBUG(dbgs() << "Converted to VOP2: " << *Converted);
1106 (void)Converted;
1107 MI.eraseFromParent();
1108}
1109
1110namespace {
1111bool isConvertibleToSDWA(MachineInstr &MI,
1112 const GCNSubtarget &ST,
1113 const SIInstrInfo* TII) {
1114 // Check if this is already an SDWA instruction
1115 unsigned Opc = MI.getOpcode();
1116 if (TII->isSDWA(Opc))
1117 return true;
1118
1119 // Can only be handled after ealier conversion to
1120 // AMDGPU::V_CNDMASK_B32_e32 which is not always possible.
1121 if (Opc == AMDGPU::V_CNDMASK_B32_e64)
1122 return false;
1123
1124 // Check if this instruction has opcode that supports SDWA
1125 if (AMDGPU::getSDWAOp(Opc) == -1)
1127
1128 if (AMDGPU::getSDWAOp(Opc) == -1)
1129 return false;
1130
1131 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
1132 return false;
1133
1134 if (TII->isVOPC(Opc)) {
1135 if (!ST.hasSDWASdst()) {
1136 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
1137 if (SDst && (SDst->getReg() != AMDGPU::VCC &&
1138 SDst->getReg() != AMDGPU::VCC_LO))
1139 return false;
1140 }
1141
1142 if (!ST.hasSDWAOutModsVOPC() &&
1143 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
1144 TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
1145 return false;
1146
1147 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
1148 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1149 return false;
1150 }
1151
1152 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
1153 Opc == AMDGPU::V_FMAC_F32_e32 ||
1154 Opc == AMDGPU::V_MAC_F16_e32 ||
1155 Opc == AMDGPU::V_MAC_F32_e32))
1156 return false;
1157
1158 // Check if target supports this SDWA opcode
1159 if (TII->pseudoToMCOpcode(Opc) == -1)
1160 return false;
1161
1162 if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
1163 if (!Src0->isReg() && !Src0->isImm())
1164 return false;
1165 }
1166
1167 if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
1168 if (!Src1->isReg() && !Src1->isImm())
1169 return false;
1170 }
1171
1172 return true;
1173}
1174} // namespace
1175
/// Build the SDWA-encoded counterpart of the non-SDWA instruction \p MI and
/// insert it immediately before \p MI. Operands are appended strictly in the
/// order the SDWA MCInstrDesc expects: dst, (mods, src0), (mods, src1),
/// [src2], clamp, [omod], dst_sel, dst_unused, src0_sel, [src1_sel].
/// \p MI itself is left in place; the caller is responsible for erasing it.
MachineInstr *SIPeepholeSDWA::createSDWAVersion(MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  assert(!TII->isSDWA(Opcode));

  // Look up the SDWA opcode directly; if the instruction only has an SDWA
  // form via its e32 (VOP2) encoding, go through that.
  int SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
  if (SDWAOpcode == -1)
    SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst, if it is present in original then should also be present in SDWA
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::vdst));
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.add(*Dst);
  } else {
    // No explicit destination in MI (VOPC-style compare): the SDWA form still
    // needs an sdst, so define VCC.
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::sdst));
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(Src0 && AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0) &&
         AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_modifiers));
  // Modifiers must be appended before the source operand itself.
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1) &&
           AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_modifiers));
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::clamp));
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::omod)) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Initialize SDWA specific operands.
  // All selections start as full-DWORD; the SDWAOperand patterns narrow them
  // later in convertToSDWA.
  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_sel))
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::dst_unused))
    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);

  assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src0_sel));
  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);

  if (Src1) {
    assert(AMDGPU::hasNamedOperand(SDWAOpcode, AMDGPU::OpName::src1_sel));
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Check for a preserved register that needs to be copied.
  MachineInstr *Ret = SDWAInst.getInstr();
  TII->fixImplicitOperands(*Ret);
  return Ret;
}
1277
/// Apply all matched \p SDWAOperands to \p MI by rewriting it (or a clone of
/// it, if MI is already SDWA) into an SDWA instruction. Returns true and
/// erases MI on success; on failure the tentative instruction is erased and
/// MI is left untouched.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {
  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  MachineInstr *SDWAInst;
  if (TII->isSDWA(MI.getOpcode())) {
    // Clone the instruction to allow revoking changes
    // made to MI during the processing of the operands
    // if the conversion fails.
    SDWAInst = MI.getParent()->getParent()->CloneMachineInstr(&MI);
    MI.getParent()->insert(MI.getIterator(), SDWAInst);
  } else {
    SDWAInst = createSDWAVersion(MI);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intersection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into
    // 3rd (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that
    // was already destroyed). So if SDWAOperand is also a potential MI then do
    // not apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }

  if (!Converted) {
    // No operand applied: discard the tentative instruction, keep MI.
    SDWAInst->eraseFromParent();
    return false;
  }

  ConvertedInstructions.push_back(SDWAInst);
  // Kill flags on the new instruction's uses may no longer be accurate after
  // the rewrite; conservatively clear them.
  for (MachineOperand &MO : SDWAInst->uses()) {
    if (!MO.isReg())
      continue;

    MRI->clearKillFlags(MO.getReg());
  }
  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
1329
// If an instruction was converted to SDWA it should not have immediates or SGPR
// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    // Only immediates and non-VGPR registers need legalization.
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    // Skip operands whose register class does not accept both SGPRs and
    // VGPRs (VS superclass) — those are not constant-bus candidates.
    unsigned I = Op.getOperandNo();
    if (Desc.operands()[I].RegClass == -1 ||
        !TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
      continue;

    // GFX9+ allows a single SGPR operand on the constant bus; keep the first
    // one as-is.
    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    // Materialize the scalar/immediate into a fresh VGPR via V_MOV_B32 and
    // rewrite the operand to use it.
    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
1362
1363bool SIPeepholeSDWALegacy::runOnMachineFunction(MachineFunction &MF) {
1364 if (skipFunction(MF.getFunction()))
1365 return false;
1366
1367 return SIPeepholeSDWA().run(MF);
1368}
1369
/// Main driver: iterate each basic block to a fixed point, first lowering
/// convertible pseudo/VOP3 forms to VOP2, then matching SDWA operand patterns
/// and folding them into SDWA instructions. Returns true if anything changed.
bool SIPeepholeSDWA::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  if (!ST.hasSDWA())
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    // Iterate until no more conversions happen: one folding round may expose
    // new matchable patterns.
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII, ST);
        if (!PotentialMI)
          continue;

        switch (PotentialMI->getOpcode()) {
        case AMDGPU::V_ADD_CO_U32_e64:
        case AMDGPU::V_SUB_CO_U32_e64:
          pseudoOpConvertToVOP2(*PotentialMI, ST);
          break;
        case AMDGPU::V_CNDMASK_B32_e64:
          convertVcndmaskToVOP2(*PotentialMI, ST);
          break;
        };
      }
      // The preprocessing above may have invalidated matches; rescan below.
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI =
            Operand->potentialToConvert(TII, ST, &PotentialMatches);

        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST, TII))
          PotentialMatches[PotentialMI].push_back(Operand.get());
      }

      // Fold each candidate's matched operands into an SDWA instruction.
      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      // Converted instructions may still carry immediates/SGPRs illegal for
      // SDWA encodings; move them into VGPRs.
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}
1439
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition Compiler.h:638
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:58
Register Reg
Register const TargetRegisterInfo * TRI
This file implements a map that provides insertion order iteration.
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
static std::optional< SdwaSel > combineSdwaSel(SdwaSel Sel, SdwaSel OperandSel)
Combine an SDWA instruction's existing SDWA selection Sel with the SDWA selection OperandSel of its o...
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
static bool canCombineOpSel(const MachineInstr &MI, const SIInstrInfo *TII, AMDGPU::OpName SrcSelOpName, SdwaSel OpSel)
Verify that the SDWA selection operand SrcSelOpName of the SDWA instruction MI can be combined with t...
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
Value * RHS
Value * LHS
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool hasOptNone() const
Do not optimize this function (-O0).
Definition Function.h:700
LLVM_ABI LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
LivenessQueryResult
Possible outcome of a register liveness query to computeRegisterLiveness()
@ LQR_Dead
Register is known to be fully dead.
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
LLVM_ABI void substituteRegister(Register FromReg, Register ToReg, unsigned SubIdx, const TargetRegisterInfo &RegInfo)
Replace all occurrences of FromReg with ToReg:SubIdx, properly composing subreg indices where necessa...
mop_range uses()
Returns all operands which may be register uses.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
void setImm(int64_t immVal)
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void setIsKill(bool Val=true)
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:78
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
self_iterator getIterator()
Definition ilist_node.h:123
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition raw_ostream.h:53
Changed
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READONLY int getSDWAOp(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Define
Register definition.
@ Kill
The last use of a register.
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
This is an optimization pass for GlobalISel generic memory operations.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
@ Offset
Definition DWP.cpp:477
Printable print(const GCNRegPressure &RP, const GCNSubtarget *ST=nullptr, unsigned DynamicVGPRBlockSize=0)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
Op::Description Desc
FunctionPass * createSIPeepholeSDWALegacyPass()
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
DWARFExpression::Operation Op
raw_ostream & operator<<(raw_ostream &OS, const APFixedPoint &FX)
char & SIPeepholeSDWALegacyID