LLVM  7.0.0svn
SIPeepholeSDWA.cpp
Go to the documentation of this file.
1 //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file This pass tries to apply several peephole SDWA patterns.
11 ///
12 /// E.g. original:
13 /// V_LSHRREV_B32_e32 %0, 16, %1
14 /// V_ADD_I32_e32 %2, %0, %3
15 /// V_LSHLREV_B32_e32 %4, 16, %2
16 ///
17 /// Replace:
18 /// V_ADD_I32_sdwa %4, %1, %3
19 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
20 ///
21 //===----------------------------------------------------------------------===//
22 
23 #include "AMDGPU.h"
24 #include "AMDGPUSubtarget.h"
25 #include "SIDefines.h"
26 #include "SIInstrInfo.h"
27 #include "SIRegisterInfo.h"
29 #include "Utils/AMDGPUBaseInfo.h"
30 #include "llvm/ADT/None.h"
31 #include "llvm/ADT/Optional.h"
32 #include "llvm/ADT/STLExtras.h"
33 #include "llvm/ADT/SmallVector.h"
34 #include "llvm/ADT/Statistic.h"
43 #include "llvm/Config/llvm-config.h"
44 #include "llvm/MC/LaneBitmask.h"
45 #include "llvm/MC/MCInstrDesc.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/Debug.h"
49 #include <algorithm>
50 #include <cassert>
51 #include <cstdint>
52 #include <memory>
53 #include <unordered_map>
54 
55 using namespace llvm;
56 
57 #define DEBUG_TYPE "si-peephole-sdwa"
58 
59 STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
60 STATISTIC(NumSDWAInstructionsPeepholed,
61  "Number of instruction converted to SDWA.");
62 
63 namespace {
64 
65 class SDWAOperand;
66 class SDWADstOperand;
67 
68 class SIPeepholeSDWA : public MachineFunctionPass {
69 public:
70  using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
71 
72 private:
74  const SIRegisterInfo *TRI;
75  const SIInstrInfo *TII;
76 
77  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
78  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
79  SmallVector<MachineInstr *, 8> ConvertedInstructions;
80 
81  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
82 
83 public:
84  static char ID;
85 
86  SIPeepholeSDWA() : MachineFunctionPass(ID) {
88  }
89 
90  bool runOnMachineFunction(MachineFunction &MF) override;
91  void matchSDWAOperands(MachineBasicBlock &MBB);
92  std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
93  bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
94  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
95  void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
96 
97  StringRef getPassName() const override { return "SI Peephole SDWA"; }
98 
99  void getAnalysisUsage(AnalysisUsage &AU) const override {
100  AU.setPreservesCFG();
102  }
103 };
104 
105 class SDWAOperand {
106 private:
107  MachineOperand *Target; // Operand that would be used in converted instruction
108  MachineOperand *Replaced; // Operand that would be replace by Target
109 
110 public:
111  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
112  : Target(TargetOp), Replaced(ReplacedOp) {
113  assert(Target->isReg());
114  assert(Replaced->isReg());
115  }
116 
117  virtual ~SDWAOperand() = default;
118 
119  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
120  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
121 
122  MachineOperand *getTargetOperand() const { return Target; }
123  MachineOperand *getReplacedOperand() const { return Replaced; }
124  MachineInstr *getParentInst() const { return Target->getParent(); }
125 
126  MachineRegisterInfo *getMRI() const {
127  return &getParentInst()->getParent()->getParent()->getRegInfo();
128  }
129 
130 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
131  virtual void print(raw_ostream& OS) const = 0;
132  void dump() const { print(dbgs()); }
133 #endif
134 };
135 
136 using namespace AMDGPU::SDWA;
137 
138 class SDWASrcOperand : public SDWAOperand {
139 private:
140  SdwaSel SrcSel;
141  bool Abs;
142  bool Neg;
143  bool Sext;
144 
145 public:
146  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
147  SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
148  bool Sext_ = false)
149  : SDWAOperand(TargetOp, ReplacedOp),
150  SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
151 
152  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
153  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
154 
155  SdwaSel getSrcSel() const { return SrcSel; }
156  bool getAbs() const { return Abs; }
157  bool getNeg() const { return Neg; }
158  bool getSext() const { return Sext; }
159 
160  uint64_t getSrcMods(const SIInstrInfo *TII,
161  const MachineOperand *SrcOp) const;
162 
163 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
164  void print(raw_ostream& OS) const override;
165 #endif
166 };
167 
168 class SDWADstOperand : public SDWAOperand {
169 private:
170  SdwaSel DstSel;
171  DstUnused DstUn;
172 
173 public:
174 
175  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
176  SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
177  : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
178 
179  MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
180  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
181 
182  SdwaSel getDstSel() const { return DstSel; }
183  DstUnused getDstUnused() const { return DstUn; }
184 
185 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
186  void print(raw_ostream& OS) const override;
187 #endif
188 };
189 
190 class SDWADstPreserveOperand : public SDWADstOperand {
191 private:
192  MachineOperand *Preserve;
193 
194 public:
195  SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
196  MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
197  : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
198  Preserve(PreserveOp) {}
199 
200  bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
201 
202  MachineOperand *getPreservedOperand() const { return Preserve; }
203 
204 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
205  void print(raw_ostream& OS) const override;
206 #endif
207 };
208 
209 } // end anonymous namespace
210 
211 INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
212 
213 char SIPeepholeSDWA::ID = 0;
214 
215 char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
216 
218  return new SIPeepholeSDWA();
219 }
220 
221 
222 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
224  switch(Sel) {
225  case BYTE_0: OS << "BYTE_0"; break;
226  case BYTE_1: OS << "BYTE_1"; break;
227  case BYTE_2: OS << "BYTE_2"; break;
228  case BYTE_3: OS << "BYTE_3"; break;
229  case WORD_0: OS << "WORD_0"; break;
230  case WORD_1: OS << "WORD_1"; break;
231  case DWORD: OS << "DWORD"; break;
232  }
233  return OS;
234 }
235 
236 static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
237  switch(Un) {
238  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
239  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
240  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
241  }
242  return OS;
243 }
244 
245 static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
246  Operand.print(OS);
247  return OS;
248 }
249 
251 void SDWASrcOperand::print(raw_ostream& OS) const {
252  OS << "SDWA src: " << *getTargetOperand()
253  << " src_sel:" << getSrcSel()
254  << " abs:" << getAbs() << " neg:" << getNeg()
255  << " sext:" << getSext() << '\n';
256 }
257 
259 void SDWADstOperand::print(raw_ostream& OS) const {
260  OS << "SDWA dst: " << *getTargetOperand()
261  << " dst_sel:" << getDstSel()
262  << " dst_unused:" << getDstUnused() << '\n';
263 }
264 
267  OS << "SDWA preserve dst: " << *getTargetOperand()
268  << " dst_sel:" << getDstSel()
269  << " preserve:" << *getPreservedOperand() << '\n';
270 }
271 
272 #endif
273 
274 static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
275  assert(To.isReg() && From.isReg());
276  To.setReg(From.getReg());
277  To.setSubReg(From.getSubReg());
278  To.setIsUndef(From.isUndef());
279  if (To.isUse()) {
280  To.setIsKill(From.isKill());
281  } else {
282  To.setIsDead(From.isDead());
283  }
284 }
285 
286 static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
287  return LHS.isReg() &&
288  RHS.isReg() &&
289  LHS.getReg() == RHS.getReg() &&
290  LHS.getSubReg() == RHS.getSubReg();
291 }
292 
294  const MachineRegisterInfo *MRI) {
295  if (!Reg->isReg() || !Reg->isDef())
296  return nullptr;
297 
298  MachineOperand *ResMO = nullptr;
299  for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
300  // If there exist use of subreg of Reg then return nullptr
301  if (!isSameReg(UseMO, *Reg))
302  return nullptr;
303 
304  // Check that there is only one instruction that uses Reg
305  if (!ResMO) {
306  ResMO = &UseMO;
307  } else if (ResMO->getParent() != UseMO.getParent()) {
308  return nullptr;
309  }
310  }
311 
312  return ResMO;
313 }
314 
316  const MachineRegisterInfo *MRI) {
317  if (!Reg->isReg())
318  return nullptr;
319 
320  MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
321  if (!DefInstr)
322  return nullptr;
323 
324  for (auto &DefMO : DefInstr->defs()) {
325  if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
326  return &DefMO;
327  }
328 
329  // Ignore implicit defs.
330  return nullptr;
331 }
332 
333 uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
334  const MachineOperand *SrcOp) const {
335  uint64_t Mods = 0;
336  const auto *MI = SrcOp->getParent();
337  if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
338  if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
339  Mods = Mod->getImm();
340  }
341  } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
342  if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
343  Mods = Mod->getImm();
344  }
345  }
346  if (Abs || Neg) {
347  assert(!Sext &&
348  "Float and integer src modifiers can't be set simulteniously");
349  Mods |= Abs ? SISrcMods::ABS : 0;
350  Mods ^= Neg ? SISrcMods::NEG : 0;
351  } else if (Sext) {
352  Mods |= SISrcMods::SEXT;
353  }
354 
355  return Mods;
356 }
357 
358 MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
359  // For SDWA src operand potential instruction is one that use register
360  // defined by parent instruction
361  MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
362  if (!PotentialMO)
363  return nullptr;
364 
365  return PotentialMO->getParent();
366 }
367 
368 bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
369  // Find operand in instruction that matches source operand and replace it with
370  // target operand. Set corresponding src_sel
371  bool IsPreserveSrc = false;
372  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
373  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
374  MachineOperand *SrcMods =
375  TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
376  assert(Src && (Src->isReg() || Src->isImm()));
377  if (!isSameReg(*Src, *getReplacedOperand())) {
378  // If this is not src0 then it could be src1
379  Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
380  SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
381  SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
382 
383  if (!Src ||
384  !isSameReg(*Src, *getReplacedOperand())) {
385  // It's possible this Src is a tied operand for
386  // UNUSED_PRESERVE, in which case we can either
387  // abandon the peephole attempt, or if legal we can
388  // copy the target operand into the tied slot
389  // if the preserve operation will effectively cause the same
390  // result by overwriting the rest of the dst.
391  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
393  TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
394 
395  if (Dst &&
397  // This will work if the tied src is accessing WORD_0, and the dst is
398  // writing WORD_1. Modifiers don't matter because all the bits that
399  // would be impacted are being overwritten by the dst.
400  // Any other case will not work.
401  SdwaSel DstSel = static_cast<SdwaSel>(
402  TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
403  if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
404  getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
405  IsPreserveSrc = true;
406  auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
407  AMDGPU::OpName::vdst);
408  auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
409  Src = &MI.getOperand(TiedIdx);
410  SrcSel = nullptr;
411  SrcMods = nullptr;
412  } else {
413  // Not legal to convert this src
414  return false;
415  }
416  }
417  }
418  assert(Src && Src->isReg());
419 
420  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
421  MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
422  !isSameReg(*Src, *getReplacedOperand())) {
423  // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
424  // src2. This is not allowed.
425  return false;
426  }
427 
428  assert(isSameReg(*Src, *getReplacedOperand()) &&
429  (IsPreserveSrc || (SrcSel && SrcMods)));
430  }
431  copyRegOperand(*Src, *getTargetOperand());
432  if (!IsPreserveSrc) {
433  SrcSel->setImm(getSrcSel());
434  SrcMods->setImm(getSrcMods(TII, Src));
435  }
436  getTargetOperand()->setIsKill(false);
437  return true;
438 }
439 
440 MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
441  // For SDWA dst operand potential instruction is one that defines register
442  // that this operand uses
443  MachineRegisterInfo *MRI = getMRI();
444  MachineInstr *ParentMI = getParentInst();
445 
446  MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
447  if (!PotentialMO)
448  return nullptr;
449 
450  // Check that ParentMI is the only instruction that uses replaced register
451  for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
452  if (&UseInst != ParentMI)
453  return nullptr;
454  }
455 
456  return PotentialMO->getParent();
457 }
458 
459 bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
460  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
461 
462  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
463  MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
464  getDstSel() != AMDGPU::SDWA::DWORD) {
465  // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
466  return false;
467  }
468 
469  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
470  assert(Operand &&
471  Operand->isReg() &&
472  isSameReg(*Operand, *getReplacedOperand()));
473  copyRegOperand(*Operand, *getTargetOperand());
474  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
475  assert(DstSel);
476  DstSel->setImm(getDstSel());
477  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
478  assert(DstUnused);
479  DstUnused->setImm(getDstUnused());
480 
481  // Remove original instruction because it would conflict with our new
482  // instruction by register definition
483  getParentInst()->eraseFromParent();
484  return true;
485 }
486 
487 bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
488  const SIInstrInfo *TII) {
489  // MI should be moved right before v_or_b32.
490  // For this we should clear all kill flags on uses of MI src-operands or else
491  // we can encounter problem with use of killed operand.
492  for (MachineOperand &MO : MI.uses()) {
493  if (!MO.isReg())
494  continue;
495  getMRI()->clearKillFlags(MO.getReg());
496  }
497 
498  // Move MI before v_or_b32
499  auto MBB = MI.getParent();
500  MBB->remove(&MI);
501  MBB->insert(getParentInst(), &MI);
502 
503  // Add Implicit use of preserved register
504  MachineInstrBuilder MIB(*MBB->getParent(), MI);
505  MIB.addReg(getPreservedOperand()->getReg(),
507  getPreservedOperand()->getSubReg());
508 
509  // Tie dst to implicit use
510  MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
511  MI.getNumOperands() - 1);
512 
513  // Convert MI as any other SDWADstOperand and remove v_or_b32
514  return SDWADstOperand::convertToSDWA(MI, TII);
515 }
516 
517 Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
518  if (Op.isImm()) {
519  return Op.getImm();
520  }
521 
522  // If this is not immediate then it can be copy of immediate value, e.g.:
523  // %1 = S_MOV_B32 255;
524  if (Op.isReg()) {
525  for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
526  if (!isSameReg(Op, Def))
527  continue;
528 
529  const MachineInstr *DefInst = Def.getParent();
530  if (!TII->isFoldableCopy(*DefInst))
531  return None;
532 
533  const MachineOperand &Copied = DefInst->getOperand(1);
534  if (!Copied.isImm())
535  return None;
536 
537  return Copied.getImm();
538  }
539  }
540 
541  return None;
542 }
543 
544 std::unique_ptr<SDWAOperand>
545 SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
546  unsigned Opcode = MI.getOpcode();
547  switch (Opcode) {
548  case AMDGPU::V_LSHRREV_B32_e32:
549  case AMDGPU::V_ASHRREV_I32_e32:
550  case AMDGPU::V_LSHLREV_B32_e32:
551  case AMDGPU::V_LSHRREV_B32_e64:
552  case AMDGPU::V_ASHRREV_I32_e64:
553  case AMDGPU::V_LSHLREV_B32_e64: {
554  // from: v_lshrrev_b32_e32 v1, 16/24, v0
555  // to SDWA src:v0 src_sel:WORD_1/BYTE_3
556 
557  // from: v_ashrrev_i32_e32 v1, 16/24, v0
558  // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
559 
560  // from: v_lshlrev_b32_e32 v1, 16/24, v0
561  // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
562  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
563  auto Imm = foldToImm(*Src0);
564  if (!Imm)
565  break;
566 
567  if (*Imm != 16 && *Imm != 24)
568  break;
569 
570  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
571  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
572  if (TRI->isPhysicalRegister(Src1->getReg()) ||
573  TRI->isPhysicalRegister(Dst->getReg()))
574  break;
575 
576  if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
577  Opcode == AMDGPU::V_LSHLREV_B32_e64) {
578  return make_unique<SDWADstOperand>(
579  Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
580  } else {
581  return make_unique<SDWASrcOperand>(
582  Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
583  Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
584  Opcode != AMDGPU::V_LSHRREV_B32_e64);
585  }
586  break;
587  }
588 
589  case AMDGPU::V_LSHRREV_B16_e32:
590  case AMDGPU::V_ASHRREV_I16_e32:
591  case AMDGPU::V_LSHLREV_B16_e32:
592  case AMDGPU::V_LSHRREV_B16_e64:
593  case AMDGPU::V_ASHRREV_I16_e64:
594  case AMDGPU::V_LSHLREV_B16_e64: {
595  // from: v_lshrrev_b16_e32 v1, 8, v0
596  // to SDWA src:v0 src_sel:BYTE_1
597 
598  // from: v_ashrrev_i16_e32 v1, 8, v0
599  // to SDWA src:v0 src_sel:BYTE_1 sext:1
600 
601  // from: v_lshlrev_b16_e32 v1, 8, v0
602  // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
603  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
604  auto Imm = foldToImm(*Src0);
605  if (!Imm || *Imm != 8)
606  break;
607 
608  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
609  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
610 
611  if (TRI->isPhysicalRegister(Src1->getReg()) ||
612  TRI->isPhysicalRegister(Dst->getReg()))
613  break;
614 
615  if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
616  Opcode == AMDGPU::V_LSHLREV_B16_e64) {
617  return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
618  } else {
619  return make_unique<SDWASrcOperand>(
620  Src1, Dst, BYTE_1, false, false,
621  Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
622  Opcode != AMDGPU::V_LSHRREV_B16_e64);
623  }
624  break;
625  }
626 
627  case AMDGPU::V_BFE_I32:
628  case AMDGPU::V_BFE_U32: {
629  // e.g.:
630  // from: v_bfe_u32 v1, v0, 8, 8
631  // to SDWA src:v0 src_sel:BYTE_1
632 
633  // offset | width | src_sel
634  // ------------------------
635  // 0 | 8 | BYTE_0
636  // 0 | 16 | WORD_0
637  // 0 | 32 | DWORD ?
638  // 8 | 8 | BYTE_1
639  // 16 | 8 | BYTE_2
640  // 16 | 16 | WORD_1
641  // 24 | 8 | BYTE_3
642 
643  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
644  auto Offset = foldToImm(*Src1);
645  if (!Offset)
646  break;
647 
648  MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
649  auto Width = foldToImm(*Src2);
650  if (!Width)
651  break;
652 
653  SdwaSel SrcSel = DWORD;
654 
655  if (*Offset == 0 && *Width == 8)
656  SrcSel = BYTE_0;
657  else if (*Offset == 0 && *Width == 16)
658  SrcSel = WORD_0;
659  else if (*Offset == 0 && *Width == 32)
660  SrcSel = DWORD;
661  else if (*Offset == 8 && *Width == 8)
662  SrcSel = BYTE_1;
663  else if (*Offset == 16 && *Width == 8)
664  SrcSel = BYTE_2;
665  else if (*Offset == 16 && *Width == 16)
666  SrcSel = WORD_1;
667  else if (*Offset == 24 && *Width == 8)
668  SrcSel = BYTE_3;
669  else
670  break;
671 
672  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
673  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
674 
675  if (TRI->isPhysicalRegister(Src0->getReg()) ||
676  TRI->isPhysicalRegister(Dst->getReg()))
677  break;
678 
679  return make_unique<SDWASrcOperand>(
680  Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
681  }
682 
683  case AMDGPU::V_AND_B32_e32:
684  case AMDGPU::V_AND_B32_e64: {
685  // e.g.:
686  // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
687  // to SDWA src:v0 src_sel:WORD_0/BYTE_0
688 
689  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
690  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
691  auto ValSrc = Src1;
692  auto Imm = foldToImm(*Src0);
693 
694  if (!Imm) {
695  Imm = foldToImm(*Src1);
696  ValSrc = Src0;
697  }
698 
699  if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
700  break;
701 
702  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
703 
704  if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
705  TRI->isPhysicalRegister(Dst->getReg()))
706  break;
707 
708  return make_unique<SDWASrcOperand>(
709  ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
710  }
711 
712  case AMDGPU::V_OR_B32_e32:
713  case AMDGPU::V_OR_B32_e64: {
714  // Patterns for dst_unused:UNUSED_PRESERVE.
715  // e.g., from:
716  // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
717  // src1_sel:WORD_1 src2_sel:WORD1
718  // v_add_f16_e32 v3, v1, v2
719  // v_or_b32_e32 v4, v0, v3
720  // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
721 
722  // Check if one of operands of v_or_b32 is SDWA instruction
724  auto CheckOROperandsForSDWA =
725  [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
726  if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
727  return CheckRetType(None);
728 
729  MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
730  if (!Op1Def)
731  return CheckRetType(None);
732 
733  MachineInstr *Op1Inst = Op1Def->getParent();
734  if (!TII->isSDWA(*Op1Inst))
735  return CheckRetType(None);
736 
737  MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
738  if (!Op2Def)
739  return CheckRetType(None);
740 
741  return CheckRetType(std::make_pair(Op1Def, Op2Def));
742  };
743 
744  MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
745  MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
746  assert(OrSDWA && OrOther);
747  auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
748  if (!Res) {
749  OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
750  OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
751  assert(OrSDWA && OrOther);
752  Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
753  if (!Res)
754  break;
755  }
756 
757  MachineOperand *OrSDWADef = Res->first;
758  MachineOperand *OrOtherDef = Res->second;
759  assert(OrSDWADef && OrOtherDef);
760 
761  MachineInstr *SDWAInst = OrSDWADef->getParent();
762  MachineInstr *OtherInst = OrOtherDef->getParent();
763 
764  // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
765  // destination patterns don't overlap. Compatible instruction can be either
766  // regular instruction with compatible bitness or SDWA instruction with
767  // correct dst_sel
768  // SDWAInst | OtherInst bitness / OtherInst dst_sel
769  // -----------------------------------------------------
770  // DWORD | no / no
771  // WORD_0 | no / BYTE_2/3, WORD_1
772  // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
773  // BYTE_0 | no / BYTE_1/2/3, WORD_1
774  // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
775  // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0
776  // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
777  // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
778  // but v_add_f32 is not.
779 
780  // TODO: add support for non-SDWA instructions as OtherInst.
781  // For now this only works with SDWA instructions. For regular instructions
782  // there is no way to determine if the instruction writes only 8/16/24-bit
783  // out of full register size and all registers are at min 32-bit wide.
784  if (!TII->isSDWA(*OtherInst))
785  break;
786 
787  SdwaSel DstSel = static_cast<SdwaSel>(
788  TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
789  SdwaSel OtherDstSel = static_cast<SdwaSel>(
790  TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
791 
792  bool DstSelAgree = false;
793  switch (DstSel) {
794  case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
795  (OtherDstSel == BYTE_3) ||
796  (OtherDstSel == WORD_1));
797  break;
798  case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
799  (OtherDstSel == BYTE_1) ||
800  (OtherDstSel == WORD_0));
801  break;
802  case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
803  (OtherDstSel == BYTE_2) ||
804  (OtherDstSel == BYTE_3) ||
805  (OtherDstSel == WORD_1));
806  break;
807  case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
808  (OtherDstSel == BYTE_2) ||
809  (OtherDstSel == BYTE_3) ||
810  (OtherDstSel == WORD_1));
811  break;
812  case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
813  (OtherDstSel == BYTE_1) ||
814  (OtherDstSel == BYTE_3) ||
815  (OtherDstSel == WORD_0));
816  break;
817  case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
818  (OtherDstSel == BYTE_1) ||
819  (OtherDstSel == BYTE_2) ||
820  (OtherDstSel == WORD_0));
821  break;
822  default: DstSelAgree = false;
823  }
824 
825  if (!DstSelAgree)
826  break;
827 
828  // Also OtherInst dst_unused should be UNUSED_PAD
829  DstUnused OtherDstUnused = static_cast<DstUnused>(
830  TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
831  if (OtherDstUnused != DstUnused::UNUSED_PAD)
832  break;
833 
834  // Create DstPreserveOperand
835  MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
836  assert(OrDst && OrDst->isReg());
837 
838  return make_unique<SDWADstPreserveOperand>(
839  OrDst, OrSDWADef, OrOtherDef, DstSel);
840 
841  }
842  }
843 
844  return std::unique_ptr<SDWAOperand>(nullptr);
845 }
846 
847 void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
848  for (MachineInstr &MI : MBB) {
849  if (auto Operand = matchSDWAOperand(MI)) {
850  LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
851  SDWAOperands[&MI] = std::move(Operand);
852  ++NumSDWAPatternsFound;
853  }
854  }
855 }
856 
857 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
858  const SISubtarget &ST) const {
859  // Check if this is already an SDWA instruction
860  unsigned Opc = MI.getOpcode();
861  if (TII->isSDWA(Opc))
862  return true;
863 
864  // Check if this instruction has opcode that supports SDWA
865  if (AMDGPU::getSDWAOp(Opc) == -1)
866  Opc = AMDGPU::getVOPe32(Opc);
867 
868  if (AMDGPU::getSDWAOp(Opc) == -1)
869  return false;
870 
871  if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
872  return false;
873 
874  if (TII->isVOPC(Opc)) {
875  if (!ST.hasSDWASdst()) {
876  const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
877  if (SDst && SDst->getReg() != AMDGPU::VCC)
878  return false;
879  }
880 
881  if (!ST.hasSDWAOutModsVOPC() &&
882  (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
883  TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
884  return false;
885 
886  } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
887  !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
888  return false;
889  }
890 
891  if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
892  Opc == AMDGPU::V_MAC_F32_e32))
893  return false;
894 
895  // FIXME: has SDWA but require handling of implicit VCC use
896  if (Opc == AMDGPU::V_CNDMASK_B32_e32)
897  return false;
898 
899  return true;
900 }
901 
902 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
903  const SDWAOperandsVector &SDWAOperands) {
904 
905  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
906 
907  // Convert to sdwa
908  int SDWAOpcode;
909  unsigned Opcode = MI.getOpcode();
910  if (TII->isSDWA(Opcode)) {
911  SDWAOpcode = Opcode;
912  } else {
913  SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
914  if (SDWAOpcode == -1)
915  SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
916  }
917  assert(SDWAOpcode != -1);
918 
919  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
920 
921  // Create SDWA version of instruction MI and initialize its operands
922  MachineInstrBuilder SDWAInst =
923  BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
924 
925  // Copy dst, if it is present in original then should also be present in SDWA
926  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
927  if (Dst) {
928  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
929  SDWAInst.add(*Dst);
930  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
931  assert(Dst &&
932  AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
933  SDWAInst.add(*Dst);
934  } else {
935  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
936  SDWAInst.addReg(AMDGPU::VCC, RegState::Define);
937  }
938 
939  // Copy src0, initialize src0_modifiers. All sdwa instructions have src0 and
940  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
941  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
942  assert(
943  Src0 &&
944  AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
945  AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
946  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
947  SDWAInst.addImm(Mod->getImm());
948  else
949  SDWAInst.addImm(0);
950  SDWAInst.add(*Src0);
951 
952  // Copy src1 if present, initialize src1_modifiers.
953  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
954  if (Src1) {
955  assert(
956  AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
957  AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
958  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
959  SDWAInst.addImm(Mod->getImm());
960  else
961  SDWAInst.addImm(0);
962  SDWAInst.add(*Src1);
963  }
964 
965  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
966  SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
967  // v_mac_f16/32 has additional src2 operand tied to vdst
968  MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
969  assert(Src2);
970  SDWAInst.add(*Src2);
971  }
972 
973  // Copy clamp if present, initialize otherwise
974  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
975  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
976  if (Clamp) {
977  SDWAInst.add(*Clamp);
978  } else {
979  SDWAInst.addImm(0);
980  }
981 
982  // Copy omod if present, initialize otherwise if needed
983  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
984  MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
985  if (OMod) {
986  SDWAInst.add(*OMod);
987  } else {
988  SDWAInst.addImm(0);
989  }
990  }
991 
992  // Copy dst_sel if present, initialize otherwise if needed
993  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
994  MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
995  if (DstSel) {
996  SDWAInst.add(*DstSel);
997  } else {
999  }
1000  }
1001 
1002  // Copy dst_unused if present, initialize otherwise if needed
1003  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
1004  MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1005  if (DstUnused) {
1006  SDWAInst.add(*DstUnused);
1007  } else {
1009  }
1010  }
1011 
1012  // Copy src0_sel if present, initialize otherwise
1013  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
1014  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
1015  if (Src0Sel) {
1016  SDWAInst.add(*Src0Sel);
1017  } else {
1019  }
1020 
1021  // Copy src1_sel if present, initialize otherwise if needed
1022  if (Src1) {
1023  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
1024  MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
1025  if (Src1Sel) {
1026  SDWAInst.add(*Src1Sel);
1027  } else {
1029  }
1030  }
1031 
1032  // Check for a preserved register that needs to be copied.
1033  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1034  if (DstUnused &&
1036  // We expect, if we are here, that the instruction was already in its SDWA form,
1037  // with a tied operand.
1038  assert(Dst && Dst->isTied());
1039  assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1040  // We also expect a vdst, since sdst can't preserve.
1041  auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
1042  assert(PreserveDstIdx != -1);
1043 
1044  auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
1045  auto Tied = MI.getOperand(TiedIdx);
1046 
1047  SDWAInst.add(Tied);
1048  SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
1049  }
1050 
1051  // Apply all sdwa operand patterns.
1052  bool Converted = false;
1053  for (auto &Operand : SDWAOperands) {
1054  LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1055  // There should be no intersection between SDWA operands and potential MIs
1056  // e.g.:
1057  // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1058  // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1059  // v_add_u32 v3, v4, v2
1060  //
1061  // In that example it is possible that we would fold 2nd instruction into 3rd
1062  // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
1063  // already destroyed). So if SDWAOperand is also a potential MI then do not
1064  // apply it.
1065  if (PotentialMatches.count(Operand->getParentInst()) == 0)
1066  Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1067  }
1068  if (Converted) {
1069  ConvertedInstructions.push_back(SDWAInst);
1070  } else {
1071  SDWAInst->eraseFromParent();
1072  return false;
1073  }
1074 
1075  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1076  ++NumSDWAInstructionsPeepholed;
1077 
1078  MI.eraseFromParent();
1079  return true;
1080 }
1081 
1082 // If an instruction was converted to SDWA it should not have immediates or SGPR
1083 // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1084 void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1085  const SISubtarget &ST) const {
1086  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1087  unsigned ConstantBusCount = 0;
1088  for (MachineOperand &Op : MI.explicit_uses()) {
1089  if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1090  continue;
1091 
1092  unsigned I = MI.getOperandNo(&Op);
1093  if (Desc.OpInfo[I].RegClass == -1 ||
1094  !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
1095  continue;
1096 
1097  if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
1098  TRI->isSGPRReg(*MRI, Op.getReg())) {
1099  ++ConstantBusCount;
1100  continue;
1101  }
1102 
1103  unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1104  auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1105  TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1106  if (Op.isImm())
1107  Copy.addImm(Op.getImm());
1108  else if (Op.isReg())
1109  Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
1110  Op.getSubReg());
1111  Op.ChangeToRegister(VGPR, false);
1112  }
1113 }
1114 
1115 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
1116  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
1117 
1118  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
1119  return false;
1120 
1121  MRI = &MF.getRegInfo();
1122  TRI = ST.getRegisterInfo();
1123  TII = ST.getInstrInfo();
1124 
1125  // Find all SDWA operands in MF.
1126  bool Ret = false;
1127  for (MachineBasicBlock &MBB : MF) {
1128  bool Changed = false;
1129  do {
1130  matchSDWAOperands(MBB);
1131 
1132  for (const auto &OperandPair : SDWAOperands) {
1133  const auto &Operand = OperandPair.second;
1134  MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1135  if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1136  PotentialMatches[PotentialMI].push_back(Operand.get());
1137  }
1138  }
1139 
1140  for (auto &PotentialPair : PotentialMatches) {
1141  MachineInstr &PotentialMI = *PotentialPair.first;
1142  convertToSDWA(PotentialMI, PotentialPair.second);
1143  }
1144 
1145  PotentialMatches.clear();
1146  SDWAOperands.clear();
1147 
1148  Changed = !ConvertedInstructions.empty();
1149 
1150  if (Changed)
1151  Ret = true;
1152  while (!ConvertedInstructions.empty())
1153  legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1154  } while (Changed);
1155  }
1156 
1157  return Ret;
1158 }
const MachineInstrBuilder & add(const MachineOperand &MO) const
Interface definition for SIRegisterInfo.
A common definition of LaneBitmask for use in TableGen and CodeGen.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
AMDGPU specific subclass of TargetSubtarget.
FunctionPass * createSIPeepholeSDWAPass()
Compute iterated dominance frontiers using a linear time algorithm.
Definition: AllocatorList.h:24
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds...
Definition: Compiler.h:449
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
Definition: MachineInstr.h:394
void ChangeToRegister(unsigned Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value...
char & SIPeepholeSDWAID
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:285
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
iterator_range< use_nodbg_iterator > use_nodbg_operands(unsigned Reg) const
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:161
unsigned getReg() const
getReg - Returns the register number.
unsigned getOperandNo(const_mop_iterator I) const
Returns the number of the operand iterator I points to.
Definition: MachineInstr.h:411
void setIsUndef(bool Val=true)
unsigned Reg
const SIInstrInfo * getInstrInfo() const override
unsigned getSubReg() const
static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS)
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:811
static MachineOperand * findSingleRegDef(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
STATISTIC(NumFunctions, "Total number of functions")
unsigned const TargetRegisterInfo * TRI
void setIsDead(bool Val=true)
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
#define DEBUG_TYPE
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
const HexagonInstrInfo * TII
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const
unsigned getNumOperands() const
Access to explicit operands of the instruction.
Definition: MachineInstr.h:314
static MachineOperand * findSingleRegUse(const MachineOperand *Reg, const MachineRegisterInfo *MRI)
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:311
bool hasSDWAOutModsVOPC() const
bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
iterator_range< def_iterator > def_operands(unsigned Reg) const
LLVM_READONLY int getSDWAOp(uint16_t Opcode)
MachineInstrBuilder BuildMI(MachineFunction &MF, const DebugLoc &DL, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
unsigned const MachineRegisterInfo * MRI
bool isFoldableCopy(const MachineInstr &MI) const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
static void copyRegOperand(MachineOperand &To, const MachineOperand &From)
bool hasVGPRs(const TargetRegisterClass *RC) const
void initializeSIPeepholeSDWAPass(PassRegistry &)
Represent the analysis usage information of a pass.
iterator_range< mop_iterator > defs()
Returns a range over all explicit operands that are register definitions.
Definition: MachineInstr.h:383
void setImm(int64_t immVal)
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:285
static void print(raw_ostream &Out, object::Archive::Kind Kind, T Val)
self_iterator getIterator()
Definition: ilist_node.h:82
iterator_range< mop_iterator > explicit_uses()
Definition: MachineInstr.h:401
void setIsKill(bool Val=true)
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:34
unsigned findTiedOperandIdx(unsigned OpIdx) const
Given the index of a tied register operand, find the operand it is tied to.
LLVM_READONLY int getVOPe32(uint16_t Opcode)
const SIRegisterInfo * getRegisterInfo() const override
MachineOperand class - Representation of each machine instruction operand.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small...
Definition: SmallVector.h:861
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:382
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:286
MachineInstr * remove(MachineInstr *I)
Remove the unbundled instruction from the instruction list without deleting it.
int64_t getImm() const
MachineInstr * getUniqueVRegDef(unsigned Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
const Function & getFunction() const
Return the LLVM function that this machine code represents.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:133
The access may modify the value stored in memory.
Target - Wrapper for Target specific information.
static unsigned getReg(const void *D, unsigned RC, unsigned RegNo)
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:156
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
Provides AMDGPU specific target descriptions.
Representation of each machine instruction.
Definition: MachineInstr.h:60
Interface definition for SIInstrInfo.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
LLVM_NODISCARD bool empty() const
Definition: SmallVector.h:62
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:390
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:73
void setReg(unsigned Reg)
Change the register this operand corresponds to.
#define I(x, y, z)
Definition: MD5.cpp:58
void setSubReg(unsigned subReg)
const MachineInstrBuilder & addReg(unsigned RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
raw_ostream & operator<<(raw_ostream &OS, const APInt &I)
Definition: APInt.h:2023
bool isReg() const
isReg - Tests if this is a MO_Register operand.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:382
const MCOperandInfo * OpInfo
Definition: MCInstrDesc.h:172
This class implements an extremely fast bulk output stream that can only output to a stream...
Definition: raw_ostream.h:46
IRTranslator LLVM IR MI
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:49
bool hasSDWAScalar() const
#define LLVM_DEBUG(X)
Definition: Debug.h:119
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:316
iterator_range< use_instr_nodbg_iterator > use_nodbg_instructions(unsigned Reg) const
unsigned createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.