Bug Summary

File: llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
Warning: line 395, column 11
Forming reference to null pointer
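
The analyzer's path, in brief: in SDWASrcOperand::convertToSDWA, 'Src' is reassigned to the src1 operand (step 5), which the analyzer assumes to be null (step 6); the recovery logic that could repair 'Src' or return false is guarded by 'if (Dst && ...)', and with 'Dst' assumed null (step 7) that guard is skipped; the assert at line 389 is compiled out under -D NDEBUG (see the invocation below), so '*Src' at line 395 can form a reference to a null pointer (step 9). A minimal self-contained sketch of the same shape, using hypothetical stand-in names rather than the pass's real API:

#include <cstdio>

struct MachineOperand {
  int Reg = 0;
};

// Hypothetical stand-ins for TII->getNamedOperand(MI, ...): either may fail.
static MachineOperand *getSrc1() { return nullptr; } // step 6: 'Src' is null
static MachineOperand *getDst() { return nullptr; }  // step 7: 'Dst' is null

static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
  return LHS.Reg == RHS.Reg;
}

static bool convert(const MachineOperand &Replaced) {
  MachineOperand *Src = getSrc1(); // step 5: value assigned to 'Src'
  if (!Src || !isSameReg(*Src, Replaced)) {
    MachineOperand *Dst = getDst();
    if (Dst) {
      // Only this guarded block repairs 'Src' or returns early, so a null
      // 'Dst' lets a null 'Src' escape it.
    }
    // assert(Src && Src->isReg()) would fire here, but -D NDEBUG removes it.
    if (!isSameReg(*Src, Replaced)) // step 9: *Src with Src == nullptr is UB
      return false;
  }
  return true;
}

int main() {
  MachineOperand R{1};
  std::printf("%d\n", convert(R)); // as written, this path trips the bug
}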

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name SIPeepholeSDWA.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/lib/Target/AMDGPU -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
1//===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This pass tries to apply several peephole SDWA patterns.
10///
11/// E.g. original:
12/// V_LSHRREV_B32_e32 %0, 16, %1
13/// V_ADD_CO_U32_e32 %2, %0, %3
14/// V_LSHLREV_B32_e32 %4, 16, %2
15///
16/// Replace:
17/// V_ADD_CO_U32_sdwa %4, %1, %3
18/// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
19///
20//===----------------------------------------------------------------------===//
21
22#include "AMDGPU.h"
23#include "GCNSubtarget.h"
24#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
25#include "llvm/ADT/MapVector.h"
26#include "llvm/ADT/Statistic.h"
27#include "llvm/CodeGen/MachineFunctionPass.h"
28
29using namespace llvm;
30
31#define DEBUG_TYPE"si-peephole-sdwa" "si-peephole-sdwa"
32
33STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
34STATISTIC(NumSDWAInstructionsPeepholed,
35 "Number of instruction converted to SDWA.");
36
37namespace {
38
39class SDWAOperand;
40class SDWADstOperand;
41
42class SIPeepholeSDWA : public MachineFunctionPass {
43public:
44 using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>;
45
46private:
47 MachineRegisterInfo *MRI;
48 const SIRegisterInfo *TRI;
49 const SIInstrInfo *TII;
50
51 MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
52 MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches;
53 SmallVector<MachineInstr *, 8> ConvertedInstructions;
54
55 Optional<int64_t> foldToImm(const MachineOperand &Op) const;
56
57public:
58 static char ID;
59
60 SIPeepholeSDWA() : MachineFunctionPass(ID) {
61 initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
62 }
63
64 bool runOnMachineFunction(MachineFunction &MF) override;
65 void matchSDWAOperands(MachineBasicBlock &MBB);
66 std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI);
67 bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const;
68 void pseudoOpConvertToVOP2(MachineInstr &MI,
69 const GCNSubtarget &ST) const;
70 bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
71 void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const;
72
73 StringRef getPassName() const override { return "SI Peephole SDWA"; }
74
75 void getAnalysisUsage(AnalysisUsage &AU) const override {
76 AU.setPreservesCFG();
77 MachineFunctionPass::getAnalysisUsage(AU);
78 }
79};
80
81class SDWAOperand {
82private:
83 MachineOperand *Target; // Operand that would be used in converted instruction
84 MachineOperand *Replaced; // Operand that would be replace by Target
85
86public:
87 SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
88 : Target(TargetOp), Replaced(ReplacedOp) {
89 assert(Target->isReg());
90 assert(Replaced->isReg());
91 }
92
93 virtual ~SDWAOperand() = default;
94
95 virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
96 virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
97
98 MachineOperand *getTargetOperand() const { return Target; }
99 MachineOperand *getReplacedOperand() const { return Replaced; }
100 MachineInstr *getParentInst() const { return Target->getParent(); }
101
102 MachineRegisterInfo *getMRI() const {
103 return &getParentInst()->getParent()->getParent()->getRegInfo();
104 }
105
106#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
107 virtual void print(raw_ostream& OS) const = 0;
108 void dump() const { print(dbgs()); }
109#endif
110};
111
112using namespace AMDGPU::SDWA;
113
114class SDWASrcOperand : public SDWAOperand {
115private:
116 SdwaSel SrcSel;
117 bool Abs;
118 bool Neg;
119 bool Sext;
120
121public:
122 SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
123 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
124 bool Sext_ = false)
125 : SDWAOperand(TargetOp, ReplacedOp),
126 SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {}
127
128 MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
129 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
130
131 SdwaSel getSrcSel() const { return SrcSel; }
132 bool getAbs() const { return Abs; }
133 bool getNeg() const { return Neg; }
134 bool getSext() const { return Sext; }
135
136 uint64_t getSrcMods(const SIInstrInfo *TII,
137 const MachineOperand *SrcOp) const;
138
139#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
140 void print(raw_ostream& OS) const override;
141#endif
142};
143
144class SDWADstOperand : public SDWAOperand {
145private:
146 SdwaSel DstSel;
147 DstUnused DstUn;
148
149public:
150
151 SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
152 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
153 : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
154
155 MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
156 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
157
158 SdwaSel getDstSel() const { return DstSel; }
159 DstUnused getDstUnused() const { return DstUn; }
160
161#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
162 void print(raw_ostream& OS) const override;
163#endif
164};
165
166class SDWADstPreserveOperand : public SDWADstOperand {
167private:
168 MachineOperand *Preserve;
169
170public:
171 SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
172 MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD)
173 : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE),
174 Preserve(PreserveOp) {}
175
176 bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
177
178 MachineOperand *getPreservedOperand() const { return Preserve; }
179
180#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
181 void print(raw_ostream& OS) const override;
182#endif
183};
184
185} // end anonymous namespace
186
187INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
188
189char SIPeepholeSDWA::ID = 0;
190
191char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
192
193FunctionPass *llvm::createSIPeepholeSDWAPass() {
194 return new SIPeepholeSDWA();
195}
196
197
198#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
199static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) {
200 switch(Sel) {
201 case BYTE_0: OS << "BYTE_0"; break;
202 case BYTE_1: OS << "BYTE_1"; break;
203 case BYTE_2: OS << "BYTE_2"; break;
204 case BYTE_3: OS << "BYTE_3"; break;
205 case WORD_0: OS << "WORD_0"; break;
206 case WORD_1: OS << "WORD_1"; break;
207 case DWORD: OS << "DWORD"; break;
208 }
209 return OS;
210}
211
212static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
213 switch(Un) {
214 case UNUSED_PAD: OS << "UNUSED_PAD"; break;
215 case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
216 case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
217 }
218 return OS;
219}
220
221LLVM_DUMP_METHOD
222void SDWASrcOperand::print(raw_ostream& OS) const {
223 OS << "SDWA src: " << *getTargetOperand()
224 << " src_sel:" << getSrcSel()
225 << " abs:" << getAbs() << " neg:" << getNeg()
226 << " sext:" << getSext() << '\n';
227}
228
229LLVM_DUMP_METHOD
230void SDWADstOperand::print(raw_ostream& OS) const {
231 OS << "SDWA dst: " << *getTargetOperand()
232 << " dst_sel:" << getDstSel()
233 << " dst_unused:" << getDstUnused() << '\n';
234}
235
236LLVM_DUMP_METHOD
237void SDWADstPreserveOperand::print(raw_ostream& OS) const {
238 OS << "SDWA preserve dst: " << *getTargetOperand()
239 << " dst_sel:" << getDstSel()
240 << " preserve:" << *getPreservedOperand() << '\n';
241}
242
243#endif
244
245static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
246 assert(To.isReg() && From.isReg());
247 To.setReg(From.getReg());
248 To.setSubReg(From.getSubReg());
249 To.setIsUndef(From.isUndef());
250 if (To.isUse()) {
251 To.setIsKill(From.isKill());
252 } else {
253 To.setIsDead(From.isDead());
254 }
255}
256
257static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
258 return LHS.isReg() &&
2
Returning zero, which participates in a condition later
259 RHS.isReg() &&
260 LHS.getReg() == RHS.getReg() &&
261 LHS.getSubReg() == RHS.getSubReg();
262}
263
264static MachineOperand *findSingleRegUse(const MachineOperand *Reg,
265 const MachineRegisterInfo *MRI) {
266 if (!Reg->isReg() || !Reg->isDef())
267 return nullptr;
268
269 MachineOperand *ResMO = nullptr;
270 for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) {
271 // If there exist use of subreg of Reg then return nullptr
272 if (!isSameReg(UseMO, *Reg))
273 return nullptr;
274
275 // Check that there is only one instruction that uses Reg
276 if (!ResMO) {
277 ResMO = &UseMO;
278 } else if (ResMO->getParent() != UseMO.getParent()) {
279 return nullptr;
280 }
281 }
282
283 return ResMO;
284}
285
286static MachineOperand *findSingleRegDef(const MachineOperand *Reg,
287 const MachineRegisterInfo *MRI) {
288 if (!Reg->isReg())
289 return nullptr;
290
291 MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg());
292 if (!DefInstr)
293 return nullptr;
294
295 for (auto &DefMO : DefInstr->defs()) {
296 if (DefMO.isReg() && DefMO.getReg() == Reg->getReg())
297 return &DefMO;
298 }
299
300 // Ignore implicit defs.
301 return nullptr;
302}
303
304uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII,
305 const MachineOperand *SrcOp) const {
306 uint64_t Mods = 0;
307 const auto *MI = SrcOp->getParent();
308 if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) {
309 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
310 Mods = Mod->getImm();
311 }
312 } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) {
313 if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) {
314 Mods = Mod->getImm();
315 }
316 }
317 if (Abs || Neg) {
318 assert(!Sext &&
319 "Float and integer src modifiers can't be set simulteniously");
320 Mods |= Abs ? SISrcMods::ABS : 0u;
321 Mods ^= Neg ? SISrcMods::NEG : 0u;
322 } else if (Sext) {
323 Mods |= SISrcMods::SEXT;
324 }
325
326 return Mods;
327}
328
329MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
330 // For SDWA src operand potential instruction is one that use register
331 // defined by parent instruction
332 MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI());
333 if (!PotentialMO)
334 return nullptr;
335
336 return PotentialMO->getParent();
337}
338
339bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
340 // Find operand in instruction that matches source operand and replace it with
341 // target operand. Set corresponding src_sel
342 bool IsPreserveSrc = false;
343 MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
344 MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
345 MachineOperand *SrcMods =
346 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
347 assert(Src && (Src->isReg() || Src->isImm()));
348 if (!isSameReg(*Src, *getReplacedOperand())) {
1
Calling 'isSameReg'
3
Returning from 'isSameReg'
4
Taking true branch
349 // If this is not src0 then it could be src1
350 Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
5
Value assigned to 'Src'
351 SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
352 SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
353
354 if (!Src ||
6
Assuming 'Src' is null
355 !isSameReg(*Src, *getReplacedOperand())) {
356 // It's possible this Src is a tied operand for
357 // UNUSED_PRESERVE, in which case we can either
358 // abandon the peephole attempt, or if legal we can
359 // copy the target operand into the tied slot
360 // if the preserve operation will effectively cause the same
361 // result by overwriting the rest of the dst.
362 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
363 MachineOperand *DstUnused =
364 TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
365
366 if (Dst &&
7
Assuming 'Dst' is null
367 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
368 // This will work if the tied src is acessing WORD_0, and the dst is
369 // writing WORD_1. Modifiers don't matter because all the bits that
370 // would be impacted are being overwritten by the dst.
371 // Any other case will not work.
372 SdwaSel DstSel = static_cast<SdwaSel>(
373 TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel));
374 if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 &&
375 getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) {
376 IsPreserveSrc = true;
377 auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
378 AMDGPU::OpName::vdst);
379 auto TiedIdx = MI.findTiedOperandIdx(DstIdx);
380 Src = &MI.getOperand(TiedIdx);
381 SrcSel = nullptr;
382 SrcMods = nullptr;
383 } else {
384 // Not legal to convert this src
385 return false;
386 }
387 }
388 }
389 assert(Src && Src->isReg());
390
391 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
8
Assuming the condition is true
392 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
393 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
394 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
395 !isSameReg(*Src, *getReplacedOperand())) {
9
Forming reference to null pointer
396 // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
397 // src2. This is not allowed.
398 return false;
399 }
400
401 assert(isSameReg(*Src, *getReplacedOperand()) &&
402 (IsPreserveSrc || (SrcSel && SrcMods)));
403 }
404 copyRegOperand(*Src, *getTargetOperand());
405 if (!IsPreserveSrc) {
406 SrcSel->setImm(getSrcSel());
407 SrcMods->setImm(getSrcMods(TII, Src));
408 }
409 getTargetOperand()->setIsKill(false);
410 return true;
411}
412
413MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
414 // For SDWA dst operand potential instruction is one that defines register
415 // that this operand uses
416 MachineRegisterInfo *MRI = getMRI();
417 MachineInstr *ParentMI = getParentInst();
418
419 MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI);
420 if (!PotentialMO)
421 return nullptr;
422
423 // Check that ParentMI is the only instruction that uses replaced register
424 for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) {
425 if (&UseInst != ParentMI)
426 return nullptr;
427 }
428
429 return PotentialMO->getParent();
430}
431
432bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
433 // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
434
435 if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa ||
436 MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa ||
437 MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
438 MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
439 getDstSel() != AMDGPU::SDWA::DWORD) {
440 // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
441 return false;
442 }
443
444 MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
445 assert(Operand &&
446 Operand->isReg() &&
447 isSameReg(*Operand, *getReplacedOperand()));
448 copyRegOperand(*Operand, *getTargetOperand());
449 MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
450 assert(DstSel);
451 DstSel->setImm(getDstSel());
452 MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
453 assert(DstUnused);
454 DstUnused->setImm(getDstUnused());
455
456 // Remove original instruction because it would conflict with our new
457 // instruction by register definition
458 getParentInst()->eraseFromParent();
459 return true;
460}
461
462bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI,
463 const SIInstrInfo *TII) {
464 // MI should be moved right before v_or_b32.
465 // For this we should clear all kill flags on uses of MI src-operands or else
466 // we can encounter problem with use of killed operand.
467 for (MachineOperand &MO : MI.uses()) {
468 if (!MO.isReg())
469 continue;
470 getMRI()->clearKillFlags(MO.getReg());
471 }
472
473 // Move MI before v_or_b32
474 auto MBB = MI.getParent();
475 MBB->remove(&MI);
476 MBB->insert(getParentInst(), &MI);
477
478 // Add Implicit use of preserved register
479 MachineInstrBuilder MIB(*MBB->getParent(), MI);
480 MIB.addReg(getPreservedOperand()->getReg(),
481 RegState::ImplicitKill,
482 getPreservedOperand()->getSubReg());
483
484 // Tie dst to implicit use
485 MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst),
486 MI.getNumOperands() - 1);
487
488 // Convert MI as any other SDWADstOperand and remove v_or_b32
489 return SDWADstOperand::convertToSDWA(MI, TII);
490}
491
492Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
493 if (Op.isImm()) {
494 return Op.getImm();
495 }
496
497 // If this is not immediate then it can be copy of immediate value, e.g.:
498 // %1 = S_MOV_B32 255;
499 if (Op.isReg()) {
500 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
501 if (!isSameReg(Op, Def))
502 continue;
503
504 const MachineInstr *DefInst = Def.getParent();
505 if (!TII->isFoldableCopy(*DefInst))
506 return None;
507
508 const MachineOperand &Copied = DefInst->getOperand(1);
509 if (!Copied.isImm())
510 return None;
511
512 return Copied.getImm();
513 }
514 }
515
516 return None;
517}
518
519std::unique_ptr<SDWAOperand>
520SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
521 unsigned Opcode = MI.getOpcode();
522 switch (Opcode) {
523 case AMDGPU::V_LSHRREV_B32_e32:
524 case AMDGPU::V_ASHRREV_I32_e32:
525 case AMDGPU::V_LSHLREV_B32_e32:
526 case AMDGPU::V_LSHRREV_B32_e64:
527 case AMDGPU::V_ASHRREV_I32_e64:
528 case AMDGPU::V_LSHLREV_B32_e64: {
529 // from: v_lshrrev_b32_e32 v1, 16/24, v0
530 // to SDWA src:v0 src_sel:WORD_1/BYTE_3
531
532 // from: v_ashrrev_i32_e32 v1, 16/24, v0
533 // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
534
535 // from: v_lshlrev_b32_e32 v1, 16/24, v0
536 // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
537 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
538 auto Imm = foldToImm(*Src0);
539 if (!Imm)
540 break;
541
542 if (*Imm != 16 && *Imm != 24)
543 break;
544
545 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
546 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
547 if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
548 break;
549
550 if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
551 Opcode == AMDGPU::V_LSHLREV_B32_e64) {
552 return std::make_unique<SDWADstOperand>(
553 Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
554 } else {
555 return std::make_unique<SDWASrcOperand>(
556 Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
557 Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
558 Opcode != AMDGPU::V_LSHRREV_B32_e64);
559 }
560 break;
561 }
562
563 case AMDGPU::V_LSHRREV_B16_e32:
564 case AMDGPU::V_ASHRREV_I16_e32:
565 case AMDGPU::V_LSHLREV_B16_e32:
566 case AMDGPU::V_LSHRREV_B16_e64:
567 case AMDGPU::V_ASHRREV_I16_e64:
568 case AMDGPU::V_LSHLREV_B16_e64: {
569 // from: v_lshrrev_b16_e32 v1, 8, v0
570 // to SDWA src:v0 src_sel:BYTE_1
571
572 // from: v_ashrrev_i16_e32 v1, 8, v0
573 // to SDWA src:v0 src_sel:BYTE_1 sext:1
574
575 // from: v_lshlrev_b16_e32 v1, 8, v0
576 // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
577 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
578 auto Imm = foldToImm(*Src0);
579 if (!Imm || *Imm != 8)
580 break;
581
582 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
583 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
584
585 if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
586 break;
587
588 if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
589 Opcode == AMDGPU::V_LSHLREV_B16_e64) {
590 return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
591 } else {
592 return std::make_unique<SDWASrcOperand>(
593 Src1, Dst, BYTE_1, false, false,
594 Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
595 Opcode != AMDGPU::V_LSHRREV_B16_e64);
596 }
597 break;
598 }
599
600 case AMDGPU::V_BFE_I32_e64:
601 case AMDGPU::V_BFE_U32_e64: {
602 // e.g.:
603 // from: v_bfe_u32 v1, v0, 8, 8
604 // to SDWA src:v0 src_sel:BYTE_1
605
606 // offset | width | src_sel
607 // ------------------------
608 // 0 | 8 | BYTE_0
609 // 0 | 16 | WORD_0
610 // 0 | 32 | DWORD ?
611 // 8 | 8 | BYTE_1
612 // 16 | 8 | BYTE_2
613 // 16 | 16 | WORD_1
614 // 24 | 8 | BYTE_3
615
616 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
617 auto Offset = foldToImm(*Src1);
618 if (!Offset)
619 break;
620
621 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
622 auto Width = foldToImm(*Src2);
623 if (!Width)
624 break;
625
626 SdwaSel SrcSel = DWORD;
627
628 if (*Offset == 0 && *Width == 8)
629 SrcSel = BYTE_0;
630 else if (*Offset == 0 && *Width == 16)
631 SrcSel = WORD_0;
632 else if (*Offset == 0 && *Width == 32)
633 SrcSel = DWORD;
634 else if (*Offset == 8 && *Width == 8)
635 SrcSel = BYTE_1;
636 else if (*Offset == 16 && *Width == 8)
637 SrcSel = BYTE_2;
638 else if (*Offset == 16 && *Width == 16)
639 SrcSel = WORD_1;
640 else if (*Offset == 24 && *Width == 8)
641 SrcSel = BYTE_3;
642 else
643 break;
644
645 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
646 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
647
648 if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
649 break;
650
651 return std::make_unique<SDWASrcOperand>(
652 Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64);
653 }
654
655 case AMDGPU::V_AND_B32_e32:
656 case AMDGPU::V_AND_B32_e64: {
657 // e.g.:
658 // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
659 // to SDWA src:v0 src_sel:WORD_0/BYTE_0
660
661 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
662 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
663 auto ValSrc = Src1;
664 auto Imm = foldToImm(*Src0);
665
666 if (!Imm) {
667 Imm = foldToImm(*Src1);
668 ValSrc = Src0;
669 }
670
671 if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff))
672 break;
673
674 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
675
676 if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
677 break;
678
679 return std::make_unique<SDWASrcOperand>(
680 ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
681 }
682
683 case AMDGPU::V_OR_B32_e32:
684 case AMDGPU::V_OR_B32_e64: {
685 // Patterns for dst_unused:UNUSED_PRESERVE.
686 // e.g., from:
687 // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD
688 // src1_sel:WORD_1 src2_sel:WORD1
689 // v_add_f16_e32 v3, v1, v2
690 // v_or_b32_e32 v4, v0, v3
691 // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3
692
693 // Check if one of operands of v_or_b32 is SDWA instruction
694 using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>;
695 auto CheckOROperandsForSDWA =
696 [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType {
697 if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg())
698 return CheckRetType(None);
699
700 MachineOperand *Op1Def = findSingleRegDef(Op1, MRI);
701 if (!Op1Def)
702 return CheckRetType(None);
703
704 MachineInstr *Op1Inst = Op1Def->getParent();
705 if (!TII->isSDWA(*Op1Inst))
706 return CheckRetType(None);
707
708 MachineOperand *Op2Def = findSingleRegDef(Op2, MRI);
709 if (!Op2Def)
710 return CheckRetType(None);
711
712 return CheckRetType(std::make_pair(Op1Def, Op2Def));
713 };
714
715 MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
716 MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
717 assert(OrSDWA && OrOther);
718 auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
719 if (!Res) {
720 OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
721 OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
722 assert(OrSDWA && OrOther);
723 Res = CheckOROperandsForSDWA(OrSDWA, OrOther);
724 if (!Res)
725 break;
726 }
727
728 MachineOperand *OrSDWADef = Res->first;
729 MachineOperand *OrOtherDef = Res->second;
730 assert(OrSDWADef && OrOtherDef);
731
732 MachineInstr *SDWAInst = OrSDWADef->getParent();
733 MachineInstr *OtherInst = OrOtherDef->getParent();
734
735 // Check that OtherInstr is actually bitwise compatible with SDWAInst = their
736 // destination patterns don't overlap. Compatible instruction can be either
737 // regular instruction with compatible bitness or SDWA instruction with
738 // correct dst_sel
739 // SDWAInst | OtherInst bitness / OtherInst dst_sel
740 // -----------------------------------------------------
741 // DWORD | no / no
742 // WORD_0 | no / BYTE_2/3, WORD_1
743 // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0
744 // BYTE_0 | no / BYTE_1/2/3, WORD_1
745 // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1
746 // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0
747 // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0
748 // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK
749 // but v_add_f32 is not.
750
751 // TODO: add support for non-SDWA instructions as OtherInst.
752 // For now this only works with SDWA instructions. For regular instructions
753 // there is no way to determine if the instruction writes only 8/16/24-bit
754 // out of full register size and all registers are at min 32-bit wide.
755 if (!TII->isSDWA(*OtherInst))
756 break;
757
758 SdwaSel DstSel = static_cast<SdwaSel>(
759 TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
760 SdwaSel OtherDstSel = static_cast<SdwaSel>(
761 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
762
763 bool DstSelAgree = false;
764 switch (DstSel) {
765 case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) ||
766 (OtherDstSel == BYTE_3) ||
767 (OtherDstSel == WORD_1));
768 break;
769 case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
770 (OtherDstSel == BYTE_1) ||
771 (OtherDstSel == WORD_0));
772 break;
773 case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) ||
774 (OtherDstSel == BYTE_2) ||
775 (OtherDstSel == BYTE_3) ||
776 (OtherDstSel == WORD_1));
777 break;
778 case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) ||
779 (OtherDstSel == BYTE_2) ||
780 (OtherDstSel == BYTE_3) ||
781 (OtherDstSel == WORD_1));
782 break;
783 case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) ||
784 (OtherDstSel == BYTE_1) ||
785 (OtherDstSel == BYTE_3) ||
786 (OtherDstSel == WORD_0));
787 break;
788 case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) ||
789 (OtherDstSel == BYTE_1) ||
790 (OtherDstSel == BYTE_2) ||
791 (OtherDstSel == WORD_0));
792 break;
793 default: DstSelAgree = false;
794 }
795
796 if (!DstSelAgree)
797 break;
798
799 // Also OtherInst dst_unused should be UNUSED_PAD
800 DstUnused OtherDstUnused = static_cast<DstUnused>(
801 TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused));
802 if (OtherDstUnused != DstUnused::UNUSED_PAD)
803 break;
804
805 // Create DstPreserveOperand
806 MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
807 assert(OrDst && OrDst->isReg());
808
809 return std::make_unique<SDWADstPreserveOperand>(
810 OrDst, OrSDWADef, OrOtherDef, DstSel);
811
812 }
813 }
814
815 return std::unique_ptr<SDWAOperand>(nullptr);
816}
817
818#if !defined(NDEBUG)
819static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
820 Operand.print(OS);
821 return OS;
822}
823#endif
824
825void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
826 for (MachineInstr &MI : MBB) {
827 if (auto Operand = matchSDWAOperand(MI)) {
828 LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n');
829 SDWAOperands[&MI] = std::move(Operand);
830 ++NumSDWAPatternsFound;
831 }
832 }
833}
834
835// Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and
836// V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA
837// to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa.
838//
839// We are transforming from a VOP3 into a VOP2 form of the instruction.
840// %19:vgpr_32 = V_AND_B32_e32 255,
841// killed %16:vgpr_32, implicit $exec
842// %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64
843// %26.sub0:vreg_64, %19:vgpr_32, implicit $exec
844// %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64
845// %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec
846//
847// becomes
848// %47:vgpr_32 = V_ADD_CO_U32_sdwa
849// 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0,
850// implicit-def $vcc, implicit $exec
851// %48:vgpr_32 = V_ADDC_U32_e32
852// 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec
853void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
854 const GCNSubtarget &ST) const {
855 int Opc = MI.getOpcode();
856 assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
857 "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");
858
859 // Can the candidate MI be shrunk?
860 if (!TII->canShrink(MI, *MRI))
861 return;
862 Opc = AMDGPU::getVOPe32(Opc);
863 // Find the related ADD instruction.
864 const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
865 if (!Sdst)
866 return;
867 MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
868 if (!NextOp)
869 return;
870 MachineInstr &MISucc = *NextOp->getParent();
871 // Can the successor be shrunk?
872 if (!TII->canShrink(MISucc, *MRI))
873 return;
874 int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
875 // Make sure the carry in/out are subsequently unused.
876 MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
877 if (!CarryIn)
878 return;
879 MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
880 if (!CarryOut)
881 return;
882 if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
883 return;
884 // Make sure VCC or its subregs are dead before MI.
885 MachineBasicBlock &MBB = *MI.getParent();
886 auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
887 if (Liveness != MachineBasicBlock::LQR_Dead)
888 return;
889 // Check if VCC is referenced in range of (MI,MISucc].
890 for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
891 I != E; ++I) {
892 if (I->modifiesRegister(AMDGPU::VCC, TRI))
893 return;
894 }
895
896 // Make the two new e32 instruction variants.
897 // Replace MI with V_{SUB|ADD}_I32_e32
898 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
899 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
900 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
901 .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
902 .setMIFlags(MI.getFlags());
903
904 MI.eraseFromParent();
905
906 // Replace MISucc with V_{SUBB|ADDC}_U32_e32
907 BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
908 .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
909 .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
910 .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
911 .setMIFlags(MISucc.getFlags());
912
913 MISucc.eraseFromParent();
914}
915
916bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI,
917 const GCNSubtarget &ST) const {
918 // Check if this is already an SDWA instruction
919 unsigned Opc = MI.getOpcode();
920 if (TII->isSDWA(Opc))
921 return true;
922
923 // Check if this instruction has opcode that supports SDWA
924 if (AMDGPU::getSDWAOp(Opc) == -1)
925 Opc = AMDGPU::getVOPe32(Opc);
926
927 if (AMDGPU::getSDWAOp(Opc) == -1)
928 return false;
929
930 if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
931 return false;
932
933 if (TII->isVOPC(Opc)) {
934 if (!ST.hasSDWASdst()) {
935 const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
936 if (SDst && (SDst->getReg() != AMDGPU::VCC &&
937 SDst->getReg() != AMDGPU::VCC_LO))
938 return false;
939 }
940
941 if (!ST.hasSDWAOutModsVOPC() &&
942 (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
943 TII->hasModifiersSet(MI, AMDGPU::OpName::omod)))
944 return false;
945
946 } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) ||
947 !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
948 return false;
949 }
950
951 if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 ||
952 Opc == AMDGPU::V_FMAC_F32_e32 ||
953 Opc == AMDGPU::V_MAC_F16_e32 ||
954 Opc == AMDGPU::V_MAC_F32_e32))
955 return false;
956
957 // Check if target supports this SDWA opcode
958 if (TII->pseudoToMCOpcode(Opc) == -1)
959 return false;
960
961 // FIXME: has SDWA but require handling of implicit VCC use
962 if (Opc == AMDGPU::V_CNDMASK_B32_e32)
963 return false;
964
965 if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) {
966 if (!Src0->isReg() && !Src0->isImm())
967 return false;
968 }
969
970 if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) {
971 if (!Src1->isReg() && !Src1->isImm())
972 return false;
973 }
974
975 return true;
976}
977
978bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
979 const SDWAOperandsVector &SDWAOperands) {
980
981 LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);
982
983 // Convert to sdwa
984 int SDWAOpcode;
985 unsigned Opcode = MI.getOpcode();
986 if (TII->isSDWA(Opcode)) {
987 SDWAOpcode = Opcode;
988 } else {
989 SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
990 if (SDWAOpcode == -1)
991 SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
992 }
993 assert(SDWAOpcode != -1);
994
995 const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
996
997 // Create SDWA version of instruction MI and initialize its operands
998 MachineInstrBuilder SDWAInst =
999 BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
1000 .setMIFlags(MI.getFlags());
1001
1002 // Copy dst, if it is present in original then should also be present in SDWA
1003 MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
1004 if (Dst) {
1005 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
1006 SDWAInst.add(*Dst);
1007 } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
1008 assert(Dst &&
1009 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1010 SDWAInst.add(*Dst);
1011 } else {
1012 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
1013 SDWAInst.addReg(TRI->getVCC(), RegState::Define);
1014 }
1015
1016 // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
1017 // src0_modifiers (except for v_nop_sdwa, but it can't get here)
1018 MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
1019 assert(
1020 Src0 &&
1021 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
1022 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
1023 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
1024 SDWAInst.addImm(Mod->getImm());
1025 else
1026 SDWAInst.addImm(0);
1027 SDWAInst.add(*Src0);
1028
1029 // Copy src1 if present, initialize src1_modifiers.
1030 MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
1031 if (Src1) {
1032 assert(
1033 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
1034 AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
1035 if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
1036 SDWAInst.addImm(Mod->getImm());
1037 else
1038 SDWAInst.addImm(0);
1039 SDWAInst.add(*Src1);
1040 }
1041
1042 if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
1043 SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
1044 SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
1045 SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
1046 // v_mac_f16/32 has additional src2 operand tied to vdst
1047 MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
1048 assert(Src2);
1049 SDWAInst.add(*Src2);
1050 }
1051
1052 // Copy clamp if present, initialize otherwise
1053 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
1054 MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
1055 if (Clamp) {
1056 SDWAInst.add(*Clamp);
1057 } else {
1058 SDWAInst.addImm(0);
1059 }
1060
1061 // Copy omod if present, initialize otherwise if needed
1062 if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
1063 MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
1064 if (OMod) {
1065 SDWAInst.add(*OMod);
1066 } else {
1067 SDWAInst.addImm(0);
1068 }
1069 }
1070
1071 // Copy dst_sel if present, initialize otherwise if needed
1072 if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
1073 MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
1074 if (DstSel) {
1075 SDWAInst.add(*DstSel);
1076 } else {
1077 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1078 }
1079 }
1080
1081 // Copy dst_unused if present, initialize otherwise if needed
1082 if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
1083 MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1084 if (DstUnused) {
1085 SDWAInst.add(*DstUnused);
1086 } else {
1087 SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
1088 }
1089 }
1090
1091 // Copy src0_sel if present, initialize otherwise
1092 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
1093 MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
1094 if (Src0Sel) {
1095 SDWAInst.add(*Src0Sel);
1096 } else {
1097 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1098 }
1099
1100 // Copy src1_sel if present, initialize otherwise if needed
1101 if (Src1) {
1102 assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
1103 MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
1104 if (Src1Sel) {
1105 SDWAInst.add(*Src1Sel);
1106 } else {
1107 SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
1108 }
1109 }
1110
1111 // Check for a preserved register that needs to be copied.
1112 auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
1113 if (DstUnused &&
1114 DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
1115 // We expect, if we are here, that the instruction was already in it's SDWA form,
1116 // with a tied operand.
1117 assert(Dst && Dst->isTied());
1118 assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
1119 // We also expect a vdst, since sdst can't preserve.
1120 auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
1121 assert(PreserveDstIdx != -1);
1122
1123 auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
1124 auto Tied = MI.getOperand(TiedIdx);
1125
1126 SDWAInst.add(Tied);
1127 SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
1128 }
1129
1130 // Apply all sdwa operand patterns.
1131 bool Converted = false;
1132 for (auto &Operand : SDWAOperands) {
1133 LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
1134 // There should be no intesection between SDWA operands and potential MIs
1135 // e.g.:
1136 // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
1137 // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
1138 // v_add_u32 v3, v4, v2
1139 //
1140 // In that example it is possible that we would fold 2nd instruction into 3rd
1141 // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
1142 // already destroyed). So if SDWAOperand is also a potential MI then do not
1143 // apply it.
1144 if (PotentialMatches.count(Operand->getParentInst()) == 0)
1145 Converted |= Operand->convertToSDWA(*SDWAInst, TII);
1146 }
1147 if (Converted) {
1148 ConvertedInstructions.push_back(SDWAInst);
1149 } else {
1150 SDWAInst->eraseFromParent();
1151 return false;
1152 }
1153
1154 LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
1155 ++NumSDWAInstructionsPeepholed;
1156
1157 MI.eraseFromParent();
1158 return true;
1159}
1160
1161// If an instruction was converted to SDWA it should not have immediates or SGPR
1162// operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs.
1163void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
1164 const GCNSubtarget &ST) const {
1165 const MCInstrDesc &Desc = TII->get(MI.getOpcode());
1166 unsigned ConstantBusCount = 0;
1167 for (MachineOperand &Op : MI.explicit_uses()) {
1168 if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
1169 continue;
1170
1171 unsigned I = MI.getOperandNo(&Op);
1172 if (Desc.OpInfo[I].RegClass == -1 ||
1173 !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
1174 continue;
1175
1176 if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
1177 TRI->isSGPRReg(*MRI, Op.getReg())) {
1178 ++ConstantBusCount;
1179 continue;
1180 }
1181
1182 Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1183 auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
1184 TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
1185 if (Op.isImm())
1186 Copy.addImm(Op.getImm());
1187 else if (Op.isReg())
1188 Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
1189 Op.getSubReg());
1190 Op.ChangeToRegister(VGPR, false);
1191 }
1192}
1193
1194bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
1195 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1196
1197 if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
1198 return false;
1199
1200 MRI = &MF.getRegInfo();
1201 TRI = ST.getRegisterInfo();
1202 TII = ST.getInstrInfo();
1203
1204 // Find all SDWA operands in MF.
1205 bool Ret = false;
1206 for (MachineBasicBlock &MBB : MF) {
1207 bool Changed = false;
1208 do {
1209 // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
1210 // Look for a possible ADD or SUB that resulted from a previously lowered
1211 // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
1212 // lowers the pair of instructions into e32 form.
1213 matchSDWAOperands(MBB);
1214 for (const auto &OperandPair : SDWAOperands) {
1215 const auto &Operand = OperandPair.second;
1216 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1217 if (PotentialMI &&
1218 (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
1219 PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
1220 pseudoOpConvertToVOP2(*PotentialMI, ST);
1221 }
1222 SDWAOperands.clear();
1223
1224 // Generate potential match list.
1225 matchSDWAOperands(MBB);
1226
1227 for (const auto &OperandPair : SDWAOperands) {
1228 const auto &Operand = OperandPair.second;
1229 MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
1230 if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
1231 PotentialMatches[PotentialMI].push_back(Operand.get());
1232 }
1233 }
1234
1235 for (auto &PotentialPair : PotentialMatches) {
1236 MachineInstr &PotentialMI = *PotentialPair.first;
1237 convertToSDWA(PotentialMI, PotentialPair.second);
1238 }
1239
1240 PotentialMatches.clear();
1241 SDWAOperands.clear();
1242
1243 Changed = !ConvertedInstructions.empty();
1244
1245 if (Changed)
1246 Ret = true;
1247 while (!ConvertedInstructions.empty())
1248 legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
1249 } while (Changed);
1250 }
1251
1252 return Ret;
1253}
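
Given the path above, one way to address the report is to make the null case explicit once the UNUSED_PRESERVE rescue has been attempted. A minimal sketch of such a guard, assuming the intent is to abandon the peephole when src1 is absent; this is an illustration, not the upstream patch:

// After the 'if (Dst && ...)' block in SDWASrcOperand::convertToSDWA:
// neither src0 nor src1 matched and the tied-operand path did not fire,
// so bail out instead of relying on the NDEBUG-stripped assert at line 389.
if (!Src)
  return false;

Alternatively, the '!Dst' case could return false directly inside the outer branch, which keeps the later 'isSameReg(*Src, ...)' checks reachable only with a non-null 'Src'.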