File: | llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp |
Warning: | line 404, column 3 Forming reference to null pointer |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions ---===// | |||
2 | // | |||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
4 | // See https://llvm.org/LICENSE.txt for license information. | |||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
6 | // | |||
7 | //===----------------------------------------------------------------------===// | |||
8 | // | |||
9 | /// \file This pass tries to apply several peephole SDWA patterns. | |||
10 | /// | |||
11 | /// E.g. original: | |||
12 | /// V_LSHRREV_B32_e32 %0, 16, %1 | |||
13 | /// V_ADD_CO_U32_e32 %2, %0, %3 | |||
14 | /// V_LSHLREV_B32_e32 %4, 16, %2 | |||
15 | /// | |||
16 | /// Replace: | |||
17 | /// V_ADD_CO_U32_sdwa %4, %1, %3 | |||
18 | /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD | |||
19 | /// | |||
20 | //===----------------------------------------------------------------------===// | |||
21 | ||||
22 | #include "AMDGPU.h" | |||
23 | #include "GCNSubtarget.h" | |||
24 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
25 | #include "llvm/ADT/MapVector.h" | |||
26 | #include "llvm/ADT/Statistic.h" | |||
27 | #include "llvm/CodeGen/MachineFunctionPass.h" | |||
28 | ||||
29 | using namespace llvm; | |||
30 | ||||
31 | #define DEBUG_TYPE"si-peephole-sdwa" "si-peephole-sdwa" | |||
32 | ||||
33 | STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.")static llvm::Statistic NumSDWAPatternsFound = {"si-peephole-sdwa" , "NumSDWAPatternsFound", "Number of SDWA patterns found."}; | |||
34 | STATISTIC(NumSDWAInstructionsPeepholed,static llvm::Statistic NumSDWAInstructionsPeepholed = {"si-peephole-sdwa" , "NumSDWAInstructionsPeepholed", "Number of instruction converted to SDWA." } | |||
35 | "Number of instruction converted to SDWA.")static llvm::Statistic NumSDWAInstructionsPeepholed = {"si-peephole-sdwa" , "NumSDWAInstructionsPeepholed", "Number of instruction converted to SDWA." }; | |||
36 | ||||
37 | namespace { | |||
38 | ||||
39 | class SDWAOperand; | |||
40 | class SDWADstOperand; | |||
41 | ||||
42 | class SIPeepholeSDWA : public MachineFunctionPass { | |||
43 | public: | |||
44 | using SDWAOperandsVector = SmallVector<SDWAOperand *, 4>; | |||
45 | ||||
46 | private: | |||
47 | MachineRegisterInfo *MRI; | |||
48 | const SIRegisterInfo *TRI; | |||
49 | const SIInstrInfo *TII; | |||
50 | ||||
51 | MapVector<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands; | |||
52 | MapVector<MachineInstr *, SDWAOperandsVector> PotentialMatches; | |||
53 | SmallVector<MachineInstr *, 8> ConvertedInstructions; | |||
54 | ||||
55 | Optional<int64_t> foldToImm(const MachineOperand &Op) const; | |||
56 | ||||
57 | public: | |||
58 | static char ID; | |||
59 | ||||
60 | SIPeepholeSDWA() : MachineFunctionPass(ID) { | |||
61 | initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry()); | |||
62 | } | |||
63 | ||||
64 | bool runOnMachineFunction(MachineFunction &MF) override; | |||
65 | void matchSDWAOperands(MachineBasicBlock &MBB); | |||
66 | std::unique_ptr<SDWAOperand> matchSDWAOperand(MachineInstr &MI); | |||
67 | bool isConvertibleToSDWA(MachineInstr &MI, const GCNSubtarget &ST) const; | |||
68 | void pseudoOpConvertToVOP2(MachineInstr &MI, | |||
69 | const GCNSubtarget &ST) const; | |||
70 | bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands); | |||
71 | void legalizeScalarOperands(MachineInstr &MI, const GCNSubtarget &ST) const; | |||
72 | ||||
73 | StringRef getPassName() const override { return "SI Peephole SDWA"; } | |||
74 | ||||
75 | void getAnalysisUsage(AnalysisUsage &AU) const override { | |||
76 | AU.setPreservesCFG(); | |||
77 | MachineFunctionPass::getAnalysisUsage(AU); | |||
78 | } | |||
79 | }; | |||
80 | ||||
81 | class SDWAOperand { | |||
82 | private: | |||
83 | MachineOperand *Target; // Operand that would be used in converted instruction | |||
84 | MachineOperand *Replaced; // Operand that would be replace by Target | |||
85 | ||||
86 | public: | |||
87 | SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp) | |||
88 | : Target(TargetOp), Replaced(ReplacedOp) { | |||
89 | assert(Target->isReg())(static_cast<void> (0)); | |||
90 | assert(Replaced->isReg())(static_cast<void> (0)); | |||
91 | } | |||
92 | ||||
93 | virtual ~SDWAOperand() = default; | |||
94 | ||||
95 | virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0; | |||
96 | virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0; | |||
97 | ||||
98 | MachineOperand *getTargetOperand() const { return Target; } | |||
99 | MachineOperand *getReplacedOperand() const { return Replaced; } | |||
100 | MachineInstr *getParentInst() const { return Target->getParent(); } | |||
101 | ||||
102 | MachineRegisterInfo *getMRI() const { | |||
103 | return &getParentInst()->getParent()->getParent()->getRegInfo(); | |||
104 | } | |||
105 | ||||
106 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
107 | virtual void print(raw_ostream& OS) const = 0; | |||
108 | void dump() const { print(dbgs()); } | |||
109 | #endif | |||
110 | }; | |||
111 | ||||
112 | using namespace AMDGPU::SDWA; | |||
113 | ||||
114 | class SDWASrcOperand : public SDWAOperand { | |||
115 | private: | |||
116 | SdwaSel SrcSel; | |||
117 | bool Abs; | |||
118 | bool Neg; | |||
119 | bool Sext; | |||
120 | ||||
121 | public: | |||
122 | SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | |||
123 | SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false, | |||
124 | bool Sext_ = false) | |||
125 | : SDWAOperand(TargetOp, ReplacedOp), | |||
126 | SrcSel(SrcSel_), Abs(Abs_), Neg(Neg_), Sext(Sext_) {} | |||
127 | ||||
128 | MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; | |||
129 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | |||
130 | ||||
131 | SdwaSel getSrcSel() const { return SrcSel; } | |||
132 | bool getAbs() const { return Abs; } | |||
133 | bool getNeg() const { return Neg; } | |||
134 | bool getSext() const { return Sext; } | |||
135 | ||||
136 | uint64_t getSrcMods(const SIInstrInfo *TII, | |||
137 | const MachineOperand *SrcOp) const; | |||
138 | ||||
139 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
140 | void print(raw_ostream& OS) const override; | |||
141 | #endif | |||
142 | }; | |||
143 | ||||
144 | class SDWADstOperand : public SDWAOperand { | |||
145 | private: | |||
146 | SdwaSel DstSel; | |||
147 | DstUnused DstUn; | |||
148 | ||||
149 | public: | |||
150 | ||||
151 | SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | |||
152 | SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD) | |||
153 | : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {} | |||
154 | ||||
155 | MachineInstr *potentialToConvert(const SIInstrInfo *TII) override; | |||
156 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | |||
157 | ||||
158 | SdwaSel getDstSel() const { return DstSel; } | |||
159 | DstUnused getDstUnused() const { return DstUn; } | |||
160 | ||||
161 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
162 | void print(raw_ostream& OS) const override; | |||
163 | #endif | |||
164 | }; | |||
165 | ||||
166 | class SDWADstPreserveOperand : public SDWADstOperand { | |||
167 | private: | |||
168 | MachineOperand *Preserve; | |||
169 | ||||
170 | public: | |||
171 | SDWADstPreserveOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp, | |||
172 | MachineOperand *PreserveOp, SdwaSel DstSel_ = DWORD) | |||
173 | : SDWADstOperand(TargetOp, ReplacedOp, DstSel_, UNUSED_PRESERVE), | |||
174 | Preserve(PreserveOp) {} | |||
175 | ||||
176 | bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override; | |||
177 | ||||
178 | MachineOperand *getPreservedOperand() const { return Preserve; } | |||
179 | ||||
180 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
181 | void print(raw_ostream& OS) const override; | |||
182 | #endif | |||
183 | }; | |||
184 | ||||
185 | } // end anonymous namespace | |||
186 | ||||
187 | INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)static void *initializeSIPeepholeSDWAPassOnce(PassRegistry & Registry) { PassInfo *PI = new PassInfo( "SI Peephole SDWA", "si-peephole-sdwa" , &SIPeepholeSDWA::ID, PassInfo::NormalCtor_t(callDefaultCtor <SIPeepholeSDWA>), false, false); Registry.registerPass (*PI, true); return PI; } static llvm::once_flag InitializeSIPeepholeSDWAPassFlag ; void llvm::initializeSIPeepholeSDWAPass(PassRegistry &Registry ) { llvm::call_once(InitializeSIPeepholeSDWAPassFlag, initializeSIPeepholeSDWAPassOnce , std::ref(Registry)); } | |||
188 | ||||
189 | char SIPeepholeSDWA::ID = 0; | |||
190 | ||||
191 | char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID; | |||
192 | ||||
193 | FunctionPass *llvm::createSIPeepholeSDWAPass() { | |||
194 | return new SIPeepholeSDWA(); | |||
195 | } | |||
196 | ||||
197 | ||||
198 | #if !defined(NDEBUG1) || defined(LLVM_ENABLE_DUMP) | |||
199 | static raw_ostream& operator<<(raw_ostream &OS, SdwaSel Sel) { | |||
200 | switch(Sel) { | |||
201 | case BYTE_0: OS << "BYTE_0"; break; | |||
202 | case BYTE_1: OS << "BYTE_1"; break; | |||
203 | case BYTE_2: OS << "BYTE_2"; break; | |||
204 | case BYTE_3: OS << "BYTE_3"; break; | |||
205 | case WORD_0: OS << "WORD_0"; break; | |||
206 | case WORD_1: OS << "WORD_1"; break; | |||
207 | case DWORD: OS << "DWORD"; break; | |||
208 | } | |||
209 | return OS; | |||
210 | } | |||
211 | ||||
212 | static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) { | |||
213 | switch(Un) { | |||
214 | case UNUSED_PAD: OS << "UNUSED_PAD"; break; | |||
215 | case UNUSED_SEXT: OS << "UNUSED_SEXT"; break; | |||
216 | case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break; | |||
217 | } | |||
218 | return OS; | |||
219 | } | |||
220 | ||||
221 | LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) | |||
222 | void SDWASrcOperand::print(raw_ostream& OS) const { | |||
223 | OS << "SDWA src: " << *getTargetOperand() | |||
224 | << " src_sel:" << getSrcSel() | |||
225 | << " abs:" << getAbs() << " neg:" << getNeg() | |||
226 | << " sext:" << getSext() << '\n'; | |||
227 | } | |||
228 | ||||
229 | LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) | |||
230 | void SDWADstOperand::print(raw_ostream& OS) const { | |||
231 | OS << "SDWA dst: " << *getTargetOperand() | |||
232 | << " dst_sel:" << getDstSel() | |||
233 | << " dst_unused:" << getDstUnused() << '\n'; | |||
234 | } | |||
235 | ||||
236 | LLVM_DUMP_METHOD__attribute__((noinline)) __attribute__((__used__)) | |||
237 | void SDWADstPreserveOperand::print(raw_ostream& OS) const { | |||
238 | OS << "SDWA preserve dst: " << *getTargetOperand() | |||
239 | << " dst_sel:" << getDstSel() | |||
240 | << " preserve:" << *getPreservedOperand() << '\n'; | |||
241 | } | |||
242 | ||||
243 | #endif | |||
244 | ||||
245 | static void copyRegOperand(MachineOperand &To, const MachineOperand &From) { | |||
246 | assert(To.isReg() && From.isReg())(static_cast<void> (0)); | |||
247 | To.setReg(From.getReg()); | |||
248 | To.setSubReg(From.getSubReg()); | |||
249 | To.setIsUndef(From.isUndef()); | |||
250 | if (To.isUse()) { | |||
251 | To.setIsKill(From.isKill()); | |||
252 | } else { | |||
253 | To.setIsDead(From.isDead()); | |||
254 | } | |||
255 | } | |||
256 | ||||
257 | static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) { | |||
258 | return LHS.isReg() && | |||
259 | RHS.isReg() && | |||
260 | LHS.getReg() == RHS.getReg() && | |||
261 | LHS.getSubReg() == RHS.getSubReg(); | |||
262 | } | |||
263 | ||||
264 | static MachineOperand *findSingleRegUse(const MachineOperand *Reg, | |||
265 | const MachineRegisterInfo *MRI) { | |||
266 | if (!Reg->isReg() || !Reg->isDef()) | |||
267 | return nullptr; | |||
268 | ||||
269 | MachineOperand *ResMO = nullptr; | |||
270 | for (MachineOperand &UseMO : MRI->use_nodbg_operands(Reg->getReg())) { | |||
271 | // If there exist use of subreg of Reg then return nullptr | |||
272 | if (!isSameReg(UseMO, *Reg)) | |||
273 | return nullptr; | |||
274 | ||||
275 | // Check that there is only one instruction that uses Reg | |||
276 | if (!ResMO) { | |||
277 | ResMO = &UseMO; | |||
278 | } else if (ResMO->getParent() != UseMO.getParent()) { | |||
279 | return nullptr; | |||
280 | } | |||
281 | } | |||
282 | ||||
283 | return ResMO; | |||
284 | } | |||
285 | ||||
286 | static MachineOperand *findSingleRegDef(const MachineOperand *Reg, | |||
287 | const MachineRegisterInfo *MRI) { | |||
288 | if (!Reg->isReg()) | |||
289 | return nullptr; | |||
290 | ||||
291 | MachineInstr *DefInstr = MRI->getUniqueVRegDef(Reg->getReg()); | |||
292 | if (!DefInstr) | |||
293 | return nullptr; | |||
294 | ||||
295 | for (auto &DefMO : DefInstr->defs()) { | |||
296 | if (DefMO.isReg() && DefMO.getReg() == Reg->getReg()) | |||
297 | return &DefMO; | |||
298 | } | |||
299 | ||||
300 | // Ignore implicit defs. | |||
301 | return nullptr; | |||
302 | } | |||
303 | ||||
304 | uint64_t SDWASrcOperand::getSrcMods(const SIInstrInfo *TII, | |||
305 | const MachineOperand *SrcOp) const { | |||
306 | uint64_t Mods = 0; | |||
307 | const auto *MI = SrcOp->getParent(); | |||
308 | if (TII->getNamedOperand(*MI, AMDGPU::OpName::src0) == SrcOp) { | |||
309 | if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) { | |||
310 | Mods = Mod->getImm(); | |||
311 | } | |||
312 | } else if (TII->getNamedOperand(*MI, AMDGPU::OpName::src1) == SrcOp) { | |||
313 | if (auto *Mod = TII->getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers)) { | |||
314 | Mods = Mod->getImm(); | |||
315 | } | |||
316 | } | |||
317 | if (Abs || Neg) { | |||
318 | assert(!Sext &&(static_cast<void> (0)) | |||
319 | "Float and integer src modifiers can't be set simulteniously")(static_cast<void> (0)); | |||
320 | Mods |= Abs ? SISrcMods::ABS : 0u; | |||
321 | Mods ^= Neg ? SISrcMods::NEG : 0u; | |||
322 | } else if (Sext) { | |||
323 | Mods |= SISrcMods::SEXT; | |||
324 | } | |||
325 | ||||
326 | return Mods; | |||
327 | } | |||
328 | ||||
329 | MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) { | |||
330 | // For SDWA src operand potential instruction is one that use register | |||
331 | // defined by parent instruction | |||
332 | MachineOperand *PotentialMO = findSingleRegUse(getReplacedOperand(), getMRI()); | |||
333 | if (!PotentialMO) | |||
334 | return nullptr; | |||
335 | ||||
336 | return PotentialMO->getParent(); | |||
337 | } | |||
338 | ||||
339 | bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { | |||
340 | // Find operand in instruction that matches source operand and replace it with | |||
341 | // target operand. Set corresponding src_sel | |||
342 | bool IsPreserveSrc = false; | |||
343 | MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
344 | MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel); | |||
345 | MachineOperand *SrcMods = | |||
346 | TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers); | |||
347 | assert(Src && (Src->isReg() || Src->isImm()))(static_cast<void> (0)); | |||
348 | if (!isSameReg(*Src, *getReplacedOperand())) { | |||
| ||||
349 | // If this is not src0 then it could be src1 | |||
350 | Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
351 | SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel); | |||
352 | SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); | |||
353 | ||||
354 | if (!Src || | |||
355 | !isSameReg(*Src, *getReplacedOperand())) { | |||
356 | // It's possible this Src is a tied operand for | |||
357 | // UNUSED_PRESERVE, in which case we can either | |||
358 | // abandon the peephole attempt, or if legal we can | |||
359 | // copy the target operand into the tied slot | |||
360 | // if the preserve operation will effectively cause the same | |||
361 | // result by overwriting the rest of the dst. | |||
362 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
363 | MachineOperand *DstUnused = | |||
364 | TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
365 | ||||
366 | if (Dst && | |||
367 | DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) { | |||
368 | // This will work if the tied src is acessing WORD_0, and the dst is | |||
369 | // writing WORD_1. Modifiers don't matter because all the bits that | |||
370 | // would be impacted are being overwritten by the dst. | |||
371 | // Any other case will not work. | |||
372 | SdwaSel DstSel = static_cast<SdwaSel>( | |||
373 | TII->getNamedImmOperand(MI, AMDGPU::OpName::dst_sel)); | |||
374 | if (DstSel == AMDGPU::SDWA::SdwaSel::WORD_1 && | |||
375 | getSrcSel() == AMDGPU::SDWA::SdwaSel::WORD_0) { | |||
376 | IsPreserveSrc = true; | |||
377 | auto DstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), | |||
378 | AMDGPU::OpName::vdst); | |||
379 | auto TiedIdx = MI.findTiedOperandIdx(DstIdx); | |||
380 | Src = &MI.getOperand(TiedIdx); | |||
381 | SrcSel = nullptr; | |||
382 | SrcMods = nullptr; | |||
383 | } else { | |||
384 | // Not legal to convert this src | |||
385 | return false; | |||
386 | } | |||
387 | } | |||
388 | } | |||
389 | assert(Src && Src->isReg())(static_cast<void> (0)); | |||
390 | ||||
391 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || | |||
392 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || | |||
393 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || | |||
394 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && | |||
395 | !isSameReg(*Src, *getReplacedOperand())) { | |||
396 | // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to | |||
397 | // src2. This is not allowed. | |||
398 | return false; | |||
399 | } | |||
400 | ||||
401 | assert(isSameReg(*Src, *getReplacedOperand()) &&(static_cast<void> (0)) | |||
402 | (IsPreserveSrc || (SrcSel && SrcMods)))(static_cast<void> (0)); | |||
403 | } | |||
404 | copyRegOperand(*Src, *getTargetOperand()); | |||
| ||||
405 | if (!IsPreserveSrc) { | |||
406 | SrcSel->setImm(getSrcSel()); | |||
407 | SrcMods->setImm(getSrcMods(TII, Src)); | |||
408 | } | |||
409 | getTargetOperand()->setIsKill(false); | |||
410 | return true; | |||
411 | } | |||
412 | ||||
413 | MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) { | |||
414 | // For SDWA dst operand potential instruction is one that defines register | |||
415 | // that this operand uses | |||
416 | MachineRegisterInfo *MRI = getMRI(); | |||
417 | MachineInstr *ParentMI = getParentInst(); | |||
418 | ||||
419 | MachineOperand *PotentialMO = findSingleRegDef(getReplacedOperand(), MRI); | |||
420 | if (!PotentialMO) | |||
421 | return nullptr; | |||
422 | ||||
423 | // Check that ParentMI is the only instruction that uses replaced register | |||
424 | for (MachineInstr &UseInst : MRI->use_nodbg_instructions(PotentialMO->getReg())) { | |||
425 | if (&UseInst != ParentMI) | |||
426 | return nullptr; | |||
427 | } | |||
428 | ||||
429 | return PotentialMO->getParent(); | |||
430 | } | |||
431 | ||||
432 | bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) { | |||
433 | // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused | |||
434 | ||||
435 | if ((MI.getOpcode() == AMDGPU::V_FMAC_F16_sdwa || | |||
436 | MI.getOpcode() == AMDGPU::V_FMAC_F32_sdwa || | |||
437 | MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa || | |||
438 | MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) && | |||
439 | getDstSel() != AMDGPU::SDWA::DWORD) { | |||
440 | // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD | |||
441 | return false; | |||
442 | } | |||
443 | ||||
444 | MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
445 | assert(Operand &&(static_cast<void> (0)) | |||
446 | Operand->isReg() &&(static_cast<void> (0)) | |||
447 | isSameReg(*Operand, *getReplacedOperand()))(static_cast<void> (0)); | |||
448 | copyRegOperand(*Operand, *getTargetOperand()); | |||
449 | MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel); | |||
450 | assert(DstSel)(static_cast<void> (0)); | |||
451 | DstSel->setImm(getDstSel()); | |||
452 | MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused); | |||
453 | assert(DstUnused)(static_cast<void> (0)); | |||
454 | DstUnused->setImm(getDstUnused()); | |||
455 | ||||
456 | // Remove original instruction because it would conflict with our new | |||
457 | // instruction by register definition | |||
458 | getParentInst()->eraseFromParent(); | |||
459 | return true; | |||
460 | } | |||
461 | ||||
462 | bool SDWADstPreserveOperand::convertToSDWA(MachineInstr &MI, | |||
463 | const SIInstrInfo *TII) { | |||
464 | // MI should be moved right before v_or_b32. | |||
465 | // For this we should clear all kill flags on uses of MI src-operands or else | |||
466 | // we can encounter problem with use of killed operand. | |||
467 | for (MachineOperand &MO : MI.uses()) { | |||
468 | if (!MO.isReg()) | |||
469 | continue; | |||
470 | getMRI()->clearKillFlags(MO.getReg()); | |||
471 | } | |||
472 | ||||
473 | // Move MI before v_or_b32 | |||
474 | auto MBB = MI.getParent(); | |||
475 | MBB->remove(&MI); | |||
476 | MBB->insert(getParentInst(), &MI); | |||
477 | ||||
478 | // Add Implicit use of preserved register | |||
479 | MachineInstrBuilder MIB(*MBB->getParent(), MI); | |||
480 | MIB.addReg(getPreservedOperand()->getReg(), | |||
481 | RegState::ImplicitKill, | |||
482 | getPreservedOperand()->getSubReg()); | |||
483 | ||||
484 | // Tie dst to implicit use | |||
485 | MI.tieOperands(AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst), | |||
486 | MI.getNumOperands() - 1); | |||
487 | ||||
488 | // Convert MI as any other SDWADstOperand and remove v_or_b32 | |||
489 | return SDWADstOperand::convertToSDWA(MI, TII); | |||
490 | } | |||
491 | ||||
492 | Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const { | |||
493 | if (Op.isImm()) { | |||
494 | return Op.getImm(); | |||
495 | } | |||
496 | ||||
497 | // If this is not immediate then it can be copy of immediate value, e.g.: | |||
498 | // %1 = S_MOV_B32 255; | |||
499 | if (Op.isReg()) { | |||
500 | for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) { | |||
501 | if (!isSameReg(Op, Def)) | |||
502 | continue; | |||
503 | ||||
504 | const MachineInstr *DefInst = Def.getParent(); | |||
505 | if (!TII->isFoldableCopy(*DefInst)) | |||
506 | return None; | |||
507 | ||||
508 | const MachineOperand &Copied = DefInst->getOperand(1); | |||
509 | if (!Copied.isImm()) | |||
510 | return None; | |||
511 | ||||
512 | return Copied.getImm(); | |||
513 | } | |||
514 | } | |||
515 | ||||
516 | return None; | |||
517 | } | |||
518 | ||||
519 | std::unique_ptr<SDWAOperand> | |||
520 | SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) { | |||
521 | unsigned Opcode = MI.getOpcode(); | |||
522 | switch (Opcode) { | |||
523 | case AMDGPU::V_LSHRREV_B32_e32: | |||
524 | case AMDGPU::V_ASHRREV_I32_e32: | |||
525 | case AMDGPU::V_LSHLREV_B32_e32: | |||
526 | case AMDGPU::V_LSHRREV_B32_e64: | |||
527 | case AMDGPU::V_ASHRREV_I32_e64: | |||
528 | case AMDGPU::V_LSHLREV_B32_e64: { | |||
529 | // from: v_lshrrev_b32_e32 v1, 16/24, v0 | |||
530 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 | |||
531 | ||||
532 | // from: v_ashrrev_i32_e32 v1, 16/24, v0 | |||
533 | // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1 | |||
534 | ||||
535 | // from: v_lshlrev_b32_e32 v1, 16/24, v0 | |||
536 | // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD | |||
537 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
538 | auto Imm = foldToImm(*Src0); | |||
539 | if (!Imm) | |||
540 | break; | |||
541 | ||||
542 | if (*Imm != 16 && *Imm != 24) | |||
543 | break; | |||
544 | ||||
545 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
546 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
547 | if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
548 | break; | |||
549 | ||||
550 | if (Opcode == AMDGPU::V_LSHLREV_B32_e32 || | |||
551 | Opcode == AMDGPU::V_LSHLREV_B32_e64) { | |||
552 | return std::make_unique<SDWADstOperand>( | |||
553 | Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD); | |||
554 | } else { | |||
555 | return std::make_unique<SDWASrcOperand>( | |||
556 | Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false, | |||
557 | Opcode != AMDGPU::V_LSHRREV_B32_e32 && | |||
558 | Opcode != AMDGPU::V_LSHRREV_B32_e64); | |||
559 | } | |||
560 | break; | |||
561 | } | |||
562 | ||||
563 | case AMDGPU::V_LSHRREV_B16_e32: | |||
564 | case AMDGPU::V_ASHRREV_I16_e32: | |||
565 | case AMDGPU::V_LSHLREV_B16_e32: | |||
566 | case AMDGPU::V_LSHRREV_B16_e64: | |||
567 | case AMDGPU::V_ASHRREV_I16_e64: | |||
568 | case AMDGPU::V_LSHLREV_B16_e64: { | |||
569 | // from: v_lshrrev_b16_e32 v1, 8, v0 | |||
570 | // to SDWA src:v0 src_sel:BYTE_1 | |||
571 | ||||
572 | // from: v_ashrrev_i16_e32 v1, 8, v0 | |||
573 | // to SDWA src:v0 src_sel:BYTE_1 sext:1 | |||
574 | ||||
575 | // from: v_lshlrev_b16_e32 v1, 8, v0 | |||
576 | // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD | |||
577 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
578 | auto Imm = foldToImm(*Src0); | |||
579 | if (!Imm || *Imm != 8) | |||
580 | break; | |||
581 | ||||
582 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
583 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
584 | ||||
585 | if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
586 | break; | |||
587 | ||||
588 | if (Opcode == AMDGPU::V_LSHLREV_B16_e32 || | |||
589 | Opcode == AMDGPU::V_LSHLREV_B16_e64) { | |||
590 | return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD); | |||
591 | } else { | |||
592 | return std::make_unique<SDWASrcOperand>( | |||
593 | Src1, Dst, BYTE_1, false, false, | |||
594 | Opcode != AMDGPU::V_LSHRREV_B16_e32 && | |||
595 | Opcode != AMDGPU::V_LSHRREV_B16_e64); | |||
596 | } | |||
597 | break; | |||
598 | } | |||
599 | ||||
600 | case AMDGPU::V_BFE_I32_e64: | |||
601 | case AMDGPU::V_BFE_U32_e64: { | |||
602 | // e.g.: | |||
603 | // from: v_bfe_u32 v1, v0, 8, 8 | |||
604 | // to SDWA src:v0 src_sel:BYTE_1 | |||
605 | ||||
606 | // offset | width | src_sel | |||
607 | // ------------------------ | |||
608 | // 0 | 8 | BYTE_0 | |||
609 | // 0 | 16 | WORD_0 | |||
610 | // 0 | 32 | DWORD ? | |||
611 | // 8 | 8 | BYTE_1 | |||
612 | // 16 | 8 | BYTE_2 | |||
613 | // 16 | 16 | WORD_1 | |||
614 | // 24 | 8 | BYTE_3 | |||
615 | ||||
616 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
617 | auto Offset = foldToImm(*Src1); | |||
618 | if (!Offset) | |||
619 | break; | |||
620 | ||||
621 | MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2); | |||
622 | auto Width = foldToImm(*Src2); | |||
623 | if (!Width) | |||
624 | break; | |||
625 | ||||
626 | SdwaSel SrcSel = DWORD; | |||
627 | ||||
628 | if (*Offset == 0 && *Width == 8) | |||
629 | SrcSel = BYTE_0; | |||
630 | else if (*Offset == 0 && *Width == 16) | |||
631 | SrcSel = WORD_0; | |||
632 | else if (*Offset == 0 && *Width == 32) | |||
633 | SrcSel = DWORD; | |||
634 | else if (*Offset == 8 && *Width == 8) | |||
635 | SrcSel = BYTE_1; | |||
636 | else if (*Offset == 16 && *Width == 8) | |||
637 | SrcSel = BYTE_2; | |||
638 | else if (*Offset == 16 && *Width == 16) | |||
639 | SrcSel = WORD_1; | |||
640 | else if (*Offset == 24 && *Width == 8) | |||
641 | SrcSel = BYTE_3; | |||
642 | else | |||
643 | break; | |||
644 | ||||
645 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
646 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
647 | ||||
648 | if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
649 | break; | |||
650 | ||||
651 | return std::make_unique<SDWASrcOperand>( | |||
652 | Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32_e64); | |||
653 | } | |||
654 | ||||
655 | case AMDGPU::V_AND_B32_e32: | |||
656 | case AMDGPU::V_AND_B32_e64: { | |||
657 | // e.g.: | |||
658 | // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0 | |||
659 | // to SDWA src:v0 src_sel:WORD_0/BYTE_0 | |||
660 | ||||
661 | MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
662 | MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
663 | auto ValSrc = Src1; | |||
664 | auto Imm = foldToImm(*Src0); | |||
665 | ||||
666 | if (!Imm) { | |||
667 | Imm = foldToImm(*Src1); | |||
668 | ValSrc = Src0; | |||
669 | } | |||
670 | ||||
671 | if (!Imm || (*Imm != 0x0000ffff && *Imm != 0x000000ff)) | |||
672 | break; | |||
673 | ||||
674 | MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
675 | ||||
676 | if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical()) | |||
677 | break; | |||
678 | ||||
679 | return std::make_unique<SDWASrcOperand>( | |||
680 | ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0); | |||
681 | } | |||
682 | ||||
683 | case AMDGPU::V_OR_B32_e32: | |||
684 | case AMDGPU::V_OR_B32_e64: { | |||
685 | // Patterns for dst_unused:UNUSED_PRESERVE. | |||
686 | // e.g., from: | |||
687 | // v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD | |||
688 | // src1_sel:WORD_1 src2_sel:WORD1 | |||
689 | // v_add_f16_e32 v3, v1, v2 | |||
690 | // v_or_b32_e32 v4, v0, v3 | |||
691 | // to SDWA preserve dst:v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE preserve:v3 | |||
692 | ||||
693 | // Check if one of operands of v_or_b32 is SDWA instruction | |||
694 | using CheckRetType = Optional<std::pair<MachineOperand *, MachineOperand *>>; | |||
695 | auto CheckOROperandsForSDWA = | |||
696 | [&](const MachineOperand *Op1, const MachineOperand *Op2) -> CheckRetType { | |||
697 | if (!Op1 || !Op1->isReg() || !Op2 || !Op2->isReg()) | |||
698 | return CheckRetType(None); | |||
699 | ||||
700 | MachineOperand *Op1Def = findSingleRegDef(Op1, MRI); | |||
701 | if (!Op1Def) | |||
702 | return CheckRetType(None); | |||
703 | ||||
704 | MachineInstr *Op1Inst = Op1Def->getParent(); | |||
705 | if (!TII->isSDWA(*Op1Inst)) | |||
706 | return CheckRetType(None); | |||
707 | ||||
708 | MachineOperand *Op2Def = findSingleRegDef(Op2, MRI); | |||
709 | if (!Op2Def) | |||
710 | return CheckRetType(None); | |||
711 | ||||
712 | return CheckRetType(std::make_pair(Op1Def, Op2Def)); | |||
713 | }; | |||
714 | ||||
715 | MachineOperand *OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
716 | MachineOperand *OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
717 | assert(OrSDWA && OrOther)(static_cast<void> (0)); | |||
718 | auto Res = CheckOROperandsForSDWA(OrSDWA, OrOther); | |||
719 | if (!Res) { | |||
720 | OrSDWA = TII->getNamedOperand(MI, AMDGPU::OpName::src1); | |||
721 | OrOther = TII->getNamedOperand(MI, AMDGPU::OpName::src0); | |||
722 | assert(OrSDWA && OrOther)(static_cast<void> (0)); | |||
723 | Res = CheckOROperandsForSDWA(OrSDWA, OrOther); | |||
724 | if (!Res) | |||
725 | break; | |||
726 | } | |||
727 | ||||
728 | MachineOperand *OrSDWADef = Res->first; | |||
729 | MachineOperand *OrOtherDef = Res->second; | |||
730 | assert(OrSDWADef && OrOtherDef)(static_cast<void> (0)); | |||
731 | ||||
732 | MachineInstr *SDWAInst = OrSDWADef->getParent(); | |||
733 | MachineInstr *OtherInst = OrOtherDef->getParent(); | |||
734 | ||||
735 | // Check that OtherInstr is actually bitwise compatible with SDWAInst = their | |||
736 | // destination patterns don't overlap. Compatible instruction can be either | |||
737 | // regular instruction with compatible bitness or SDWA instruction with | |||
738 | // correct dst_sel | |||
739 | // SDWAInst | OtherInst bitness / OtherInst dst_sel | |||
740 | // ----------------------------------------------------- | |||
741 | // DWORD | no / no | |||
742 | // WORD_0 | no / BYTE_2/3, WORD_1 | |||
743 | // WORD_1 | 8/16-bit instructions / BYTE_0/1, WORD_0 | |||
744 | // BYTE_0 | no / BYTE_1/2/3, WORD_1 | |||
745 | // BYTE_1 | 8-bit / BYTE_0/2/3, WORD_1 | |||
746 | // BYTE_2 | 8/16-bit / BYTE_0/1/3. WORD_0 | |||
747 | // BYTE_3 | 8/16/24-bit / BYTE_0/1/2, WORD_0 | |||
748 | // E.g. if SDWAInst is v_add_f16_sdwa dst_sel:WORD_1 then v_add_f16 is OK | |||
749 | // but v_add_f32 is not. | |||
750 | ||||
751 | // TODO: add support for non-SDWA instructions as OtherInst. | |||
752 | // For now this only works with SDWA instructions. For regular instructions | |||
753 | // there is no way to determine if the instruction writes only 8/16/24-bit | |||
754 | // out of full register size and all registers are at min 32-bit wide. | |||
755 | if (!TII->isSDWA(*OtherInst)) | |||
756 | break; | |||
757 | ||||
758 | SdwaSel DstSel = static_cast<SdwaSel>( | |||
759 | TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));; | |||
760 | SdwaSel OtherDstSel = static_cast<SdwaSel>( | |||
761 | TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel)); | |||
762 | ||||
763 | bool DstSelAgree = false; | |||
764 | switch (DstSel) { | |||
765 | case WORD_0: DstSelAgree = ((OtherDstSel == BYTE_2) || | |||
766 | (OtherDstSel == BYTE_3) || | |||
767 | (OtherDstSel == WORD_1)); | |||
768 | break; | |||
769 | case WORD_1: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
770 | (OtherDstSel == BYTE_1) || | |||
771 | (OtherDstSel == WORD_0)); | |||
772 | break; | |||
773 | case BYTE_0: DstSelAgree = ((OtherDstSel == BYTE_1) || | |||
774 | (OtherDstSel == BYTE_2) || | |||
775 | (OtherDstSel == BYTE_3) || | |||
776 | (OtherDstSel == WORD_1)); | |||
777 | break; | |||
778 | case BYTE_1: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
779 | (OtherDstSel == BYTE_2) || | |||
780 | (OtherDstSel == BYTE_3) || | |||
781 | (OtherDstSel == WORD_1)); | |||
782 | break; | |||
783 | case BYTE_2: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
784 | (OtherDstSel == BYTE_1) || | |||
785 | (OtherDstSel == BYTE_3) || | |||
786 | (OtherDstSel == WORD_0)); | |||
787 | break; | |||
788 | case BYTE_3: DstSelAgree = ((OtherDstSel == BYTE_0) || | |||
789 | (OtherDstSel == BYTE_1) || | |||
790 | (OtherDstSel == BYTE_2) || | |||
791 | (OtherDstSel == WORD_0)); | |||
792 | break; | |||
793 | default: DstSelAgree = false; | |||
794 | } | |||
795 | ||||
796 | if (!DstSelAgree) | |||
797 | break; | |||
798 | ||||
799 | // Also OtherInst dst_unused should be UNUSED_PAD | |||
800 | DstUnused OtherDstUnused = static_cast<DstUnused>( | |||
801 | TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_unused)); | |||
802 | if (OtherDstUnused != DstUnused::UNUSED_PAD) | |||
803 | break; | |||
804 | ||||
805 | // Create DstPreserveOperand | |||
806 | MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst); | |||
807 | assert(OrDst && OrDst->isReg())(static_cast<void> (0)); | |||
808 | ||||
809 | return std::make_unique<SDWADstPreserveOperand>( | |||
810 | OrDst, OrSDWADef, OrOtherDef, DstSel); | |||
811 | ||||
812 | } | |||
813 | } | |||
814 | ||||
815 | return std::unique_ptr<SDWAOperand>(nullptr); | |||
816 | } | |||
817 | ||||
818 | #if !defined(NDEBUG1) | |||
/// Stream-insertion helper for debug output: delegates to
/// SDWAOperand::print so operands can appear directly in LLVM_DEBUG dumps.
static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
  Operand.print(OS);
  return OS;
}
823 | #endif | |||
824 | ||||
825 | void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) { | |||
826 | for (MachineInstr &MI : MBB) { | |||
827 | if (auto Operand = matchSDWAOperand(MI)) { | |||
828 | LLVM_DEBUG(dbgs() << "Match: " << MI << "To: " << *Operand << '\n')do { } while (false); | |||
829 | SDWAOperands[&MI] = std::move(Operand); | |||
830 | ++NumSDWAPatternsFound; | |||
831 | } | |||
832 | } | |||
833 | } | |||
834 | ||||
835 | // Convert the V_ADDC_U32_e64 into V_ADDC_U32_e32, and | |||
836 | // V_ADD_CO_U32_e64 into V_ADD_CO_U32_e32. This allows isConvertibleToSDWA | |||
837 | // to perform its transformation on V_ADD_CO_U32_e32 into V_ADD_CO_U32_sdwa. | |||
838 | // | |||
839 | // We are transforming from a VOP3 into a VOP2 form of the instruction. | |||
840 | // %19:vgpr_32 = V_AND_B32_e32 255, | |||
841 | // killed %16:vgpr_32, implicit $exec | |||
842 | // %47:vgpr_32, %49:sreg_64_xexec = V_ADD_CO_U32_e64 | |||
843 | // %26.sub0:vreg_64, %19:vgpr_32, implicit $exec | |||
844 | // %48:vgpr_32, dead %50:sreg_64_xexec = V_ADDC_U32_e64 | |||
845 | // %26.sub1:vreg_64, %54:vgpr_32, killed %49:sreg_64_xexec, implicit $exec | |||
846 | // | |||
847 | // becomes | |||
848 | // %47:vgpr_32 = V_ADD_CO_U32_sdwa | |||
849 | // 0, %26.sub0:vreg_64, 0, killed %16:vgpr_32, 0, 6, 0, 6, 0, | |||
850 | // implicit-def $vcc, implicit $exec | |||
851 | // %48:vgpr_32 = V_ADDC_U32_e32 | |||
852 | // 0, %26.sub1:vreg_64, implicit-def $vcc, implicit $vcc, implicit $exec | |||
/// Shrink a V_{ADD|SUB}_CO_U32_e64 plus its dependent V_{ADDC|SUBB}_U32_e64
/// into their e32 (VOP2) forms so that isConvertibleToSDWA can later turn
/// the first of them into an SDWA instruction. Bails out (leaving MI
/// untouched) unless every legality condition below holds.
void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
                                           const GCNSubtarget &ST) const {
  int Opc = MI.getOpcode();
  assert((Opc == AMDGPU::V_ADD_CO_U32_e64 || Opc == AMDGPU::V_SUB_CO_U32_e64) &&
         "Currently only handles V_ADD_CO_U32_e64 or V_SUB_CO_U32_e64");

  // Can the candidate MI be shrunk?
  if (!TII->canShrink(MI, *MRI))
    return;
  Opc = AMDGPU::getVOPe32(Opc);
  // Find the related ADD instruction: the single user of MI's carry-out
  // (sdst) is expected to be the matching ADDC/SUBB.
  const MachineOperand *Sdst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Sdst)
    return;
  MachineOperand *NextOp = findSingleRegUse(Sdst, MRI);
  if (!NextOp)
    return;
  MachineInstr &MISucc = *NextOp->getParent();
  // Can the successor be shrunk?
  if (!TII->canShrink(MISucc, *MRI))
    return;
  int SuccOpc = AMDGPU::getVOPe32(MISucc.getOpcode());
  // Make sure the carry in/out are subsequently unused: the e32 forms route
  // the carry through VCC implicitly, so the explicit carry registers must
  // have no other consumers.
  MachineOperand *CarryIn = TII->getNamedOperand(MISucc, AMDGPU::OpName::src2);
  if (!CarryIn)
    return;
  MachineOperand *CarryOut = TII->getNamedOperand(MISucc, AMDGPU::OpName::sdst);
  if (!CarryOut)
    return;
  if (!MRI->hasOneUse(CarryIn->getReg()) || !MRI->use_empty(CarryOut->getReg()))
    return;
  // Make sure VCC or its subregs are dead before MI, since the e32 forms
  // will clobber it. The 25 is the neighborhood (in instructions) searched
  // by the liveness query.
  MachineBasicBlock &MBB = *MI.getParent();
  auto Liveness = MBB.computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 25);
  if (Liveness != MachineBasicBlock::LQR_Dead)
    return;
  // Check if VCC is referenced in range of (MI,MISucc]; any write to VCC in
  // between would corrupt the implicit carry chain we are about to create.
  for (auto I = std::next(MI.getIterator()), E = MISucc.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(AMDGPU::VCC, TRI))
      return;
  }

  // Make the two new e32 instruction variants.
  // Replace MI with V_{SUB|ADD}_I32_e32
  BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
      .setMIFlags(MI.getFlags());

  MI.eraseFromParent();

  // Replace MISucc with V_{SUBB|ADDC}_U32_e32
  BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
      .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
      .setMIFlags(MISucc.getFlags());

  MISucc.eraseFromParent();
}
915 | ||||
916 | bool SIPeepholeSDWA::isConvertibleToSDWA(MachineInstr &MI, | |||
917 | const GCNSubtarget &ST) const { | |||
918 | // Check if this is already an SDWA instruction | |||
919 | unsigned Opc = MI.getOpcode(); | |||
920 | if (TII->isSDWA(Opc)) | |||
921 | return true; | |||
922 | ||||
923 | // Check if this instruction has opcode that supports SDWA | |||
924 | if (AMDGPU::getSDWAOp(Opc) == -1) | |||
925 | Opc = AMDGPU::getVOPe32(Opc); | |||
926 | ||||
927 | if (AMDGPU::getSDWAOp(Opc) == -1) | |||
928 | return false; | |||
929 | ||||
930 | if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) | |||
931 | return false; | |||
932 | ||||
933 | if (TII->isVOPC(Opc)) { | |||
934 | if (!ST.hasSDWASdst()) { | |||
935 | const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst); | |||
936 | if (SDst && (SDst->getReg() != AMDGPU::VCC && | |||
937 | SDst->getReg() != AMDGPU::VCC_LO)) | |||
938 | return false; | |||
939 | } | |||
940 | ||||
941 | if (!ST.hasSDWAOutModsVOPC() && | |||
942 | (TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) || | |||
943 | TII->hasModifiersSet(MI, AMDGPU::OpName::omod))) | |||
944 | return false; | |||
945 | ||||
946 | } else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst) || | |||
947 | !TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) { | |||
948 | return false; | |||
949 | } | |||
950 | ||||
951 | if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_FMAC_F16_e32 || | |||
952 | Opc == AMDGPU::V_FMAC_F32_e32 || | |||
953 | Opc == AMDGPU::V_MAC_F16_e32 || | |||
954 | Opc == AMDGPU::V_MAC_F32_e32)) | |||
955 | return false; | |||
956 | ||||
957 | // Check if target supports this SDWA opcode | |||
958 | if (TII->pseudoToMCOpcode(Opc) == -1) | |||
959 | return false; | |||
960 | ||||
961 | // FIXME: has SDWA but require handling of implicit VCC use | |||
962 | if (Opc == AMDGPU::V_CNDMASK_B32_e32) | |||
963 | return false; | |||
964 | ||||
965 | if (MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0)) { | |||
966 | if (!Src0->isReg() && !Src0->isImm()) | |||
967 | return false; | |||
968 | } | |||
969 | ||||
970 | if (MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1)) { | |||
971 | if (!Src1->isReg() && !Src1->isImm()) | |||
972 | return false; | |||
973 | } | |||
974 | ||||
975 | return true; | |||
976 | } | |||
977 | ||||
/// Rewrite \p MI into its SDWA form and apply every matched operand pattern
/// in \p SDWAOperands to the new instruction. Returns true (and erases MI)
/// if at least one pattern was folded in; otherwise the freshly built SDWA
/// instruction is discarded and MI is left untouched.
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
                                   const SDWAOperandsVector &SDWAOperands) {

  LLVM_DEBUG(dbgs() << "Convert instruction:" << MI);

  // Convert to sdwa: resolve the SDWA opcode, falling back through the VOP2
  // (e32) form when the current opcode has no direct SDWA counterpart.
  int SDWAOpcode;
  unsigned Opcode = MI.getOpcode();
  if (TII->isSDWA(Opcode)) {
    SDWAOpcode = Opcode;
  } else {
    SDWAOpcode = AMDGPU::getSDWAOp(Opcode);
    if (SDWAOpcode == -1)
      SDWAOpcode = AMDGPU::getSDWAOp(AMDGPU::getVOPe32(Opcode));
  }
  assert(SDWAOpcode != -1);

  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);

  // Create SDWA version of instruction MI and initialize its operands
  MachineInstrBuilder SDWAInst =
      BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
          .setMIFlags(MI.getFlags());

  // Copy dst, if it is present in original then should also be present in
  // SDWA. VOPC-style instructions have neither vdst nor sdst, in which case
  // the implicit VCC define is added explicitly.
  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
  if (Dst) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
    SDWAInst.add(*Dst);
  } else if ((Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst))) {
    assert(Dst &&
           AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.add(*Dst);
  } else {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::sdst) != -1);
    SDWAInst.addReg(TRI->getVCC(), RegState::Define);
  }

  // Copy src0, initialize src0_modifiers. All sdwa instructions has src0 and
  // src0_modifiers (except for v_nop_sdwa, but it can't get here)
  // NOTE(review): Src0 is only validated by the assert below, which compiles
  // out in release builds before *Src0 is dereferenced — the static analyzer
  // flags this as a possible null dereference; presumably unreachable given
  // the opcode filtering in isConvertibleToSDWA, but worth confirming.
  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  assert(
      Src0 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
  if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers))
    SDWAInst.addImm(Mod->getImm());
  else
    SDWAInst.addImm(0);
  SDWAInst.add(*Src0);

  // Copy src1 if present, initialize src1_modifiers.
  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1) {
    assert(
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
        AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
    if (auto *Mod = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers))
      SDWAInst.addImm(Mod->getImm());
    else
      SDWAInst.addImm(0);
    SDWAInst.add(*Src1);
  }

  if (SDWAOpcode == AMDGPU::V_FMAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_FMAC_F32_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
    // v_mac_f16/32 has additional src2 operand tied to vdst
    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    assert(Src2);
    SDWAInst.add(*Src2);
  }

  // Copy clamp if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
  MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
  if (Clamp) {
    SDWAInst.add(*Clamp);
  } else {
    SDWAInst.addImm(0);
  }

  // Copy omod if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
    MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
    if (OMod) {
      SDWAInst.add(*OMod);
    } else {
      SDWAInst.addImm(0);
    }
  }

  // Copy dst_sel if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1) {
    MachineOperand *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
    if (DstSel) {
      SDWAInst.add(*DstSel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Copy dst_unused if present, initialize otherwise if needed
  if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1) {
    MachineOperand *DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
    if (DstUnused) {
      SDWAInst.add(*DstUnused);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
    }
  }

  // Copy src0_sel if present, initialize otherwise
  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
  MachineOperand *Src0Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
  if (Src0Sel) {
    SDWAInst.add(*Src0Sel);
  } else {
    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
  }

  // Copy src1_sel if present, initialize otherwise if needed
  if (Src1) {
    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
    MachineOperand *Src1Sel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
    if (Src1Sel) {
      SDWAInst.add(*Src1Sel);
    } else {
      SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
    }
  }

  // Check for a preserved register that needs to be copied.
  auto DstUnused = TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
  if (DstUnused &&
      DstUnused->getImm() == AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) {
    // We expect, if we are here, that the instruction was already in it's SDWA form,
    // with a tied operand.
    assert(Dst && Dst->isTied());
    assert(Opcode == static_cast<unsigned int>(SDWAOpcode));
    // We also expect a vdst, since sdst can't preserve.
    auto PreserveDstIdx = AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst);
    assert(PreserveDstIdx != -1);

    // Re-tie the preserved source to the new instruction's vdst so the
    // register allocator keeps them coalesced.
    auto TiedIdx = MI.findTiedOperandIdx(PreserveDstIdx);
    auto Tied = MI.getOperand(TiedIdx);

    SDWAInst.add(Tied);
    SDWAInst->tieOperands(PreserveDstIdx, SDWAInst->getNumOperands() - 1);
  }

  // Apply all sdwa operand patterns.
  bool Converted = false;
  for (auto &Operand : SDWAOperands) {
    LLVM_DEBUG(dbgs() << *SDWAInst << "\nOperand: " << *Operand);
    // There should be no intesection between SDWA operands and potential MIs
    // e.g.:
    // v_and_b32 v0, 0xff, v1 -> src:v1 sel:BYTE_0
    // v_and_b32 v2, 0xff, v0 -> src:v0 sel:BYTE_0
    // v_add_u32 v3, v4, v2
    //
    // In that example it is possible that we would fold 2nd instruction into 3rd
    // (v_add_u32_sdwa) and then try to fold 1st instruction into 2nd (that was
    // already destroyed). So if SDWAOperand is also a potential MI then do not
    // apply it.
    if (PotentialMatches.count(Operand->getParentInst()) == 0)
      Converted |= Operand->convertToSDWA(*SDWAInst, TII);
  }
  if (Converted) {
    ConvertedInstructions.push_back(SDWAInst);
  } else {
    // No pattern folded in: the rewrite gained nothing, so drop the new
    // instruction and keep the original.
    SDWAInst->eraseFromParent();
    return false;
  }

  LLVM_DEBUG(dbgs() << "\nInto:" << *SDWAInst << '\n');
  ++NumSDWAInstructionsPeepholed;

  MI.eraseFromParent();
  return true;
}
1160 | ||||
1161 | // If an instruction was converted to SDWA it should not have immediates or SGPR | |||
1162 | // operands (allowed one SGPR on GFX9). Copy its scalar operands into VGPRs. | |||
/// Copy the scalar (SGPR/immediate) source operands of a converted SDWA
/// instruction into fresh VGPRs via V_MOV_B32, since SDWA instructions may
/// not read immediates or SGPRs (except a single SGPR on targets with
/// hasSDWAScalar, e.g. GFX9).
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
                                            const GCNSubtarget &ST) const {
  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
  unsigned ConstantBusCount = 0;
  for (MachineOperand &Op : MI.explicit_uses()) {
    // Only immediates and non-VGPR registers need legalization.
    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
      continue;

    // Skip operands whose register class cannot hold a VGPR anyway
    // (e.g. the modifier immediates).
    unsigned I = MI.getOperandNo(&Op);
    if (Desc.OpInfo[I].RegClass == -1 ||
        !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
      continue;

    // On targets that allow one scalar source, let the first SGPR stay.
    if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
        TRI->isSGPRReg(*MRI, Op.getReg())) {
      ++ConstantBusCount;
      continue;
    }

    // Materialize the value in a new VGPR and rewrite the operand to use it.
    Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
    if (Op.isImm())
      Copy.addImm(Op.getImm());
    else if (Op.isReg())
      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
                  Op.getSubReg());
    Op.ChangeToRegister(VGPR, false);
  }
}
1193 | ||||
/// Pass entry point: run the SDWA peephole over every basic block of \p MF,
/// iterating each block to a fixed point since one conversion can expose
/// further matches. Returns true if any instruction was converted.
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();

  // Nothing to do on subtargets without SDWA, or when optimization is
  // disabled for this function.
  if (!ST.hasSDWA() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST.getRegisterInfo();
  TII = ST.getInstrInfo();

  // Find all SDWA operands in MF.
  bool Ret = false;
  for (MachineBasicBlock &MBB : MF) {
    bool Changed = false;
    do {
      // Preprocess the ADD/SUB pairs so they could be SDWA'ed.
      // Look for a possible ADD or SUB that resulted from a previously lowered
      // V_{ADD|SUB}_U64_PSEUDO. The function pseudoOpConvertToVOP2
      // lowers the pair of instructions into e32 form.
      matchSDWAOperands(MBB);
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI &&
           (PotentialMI->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 ||
            PotentialMI->getOpcode() == AMDGPU::V_SUB_CO_U32_e64))
          pseudoOpConvertToVOP2(*PotentialMI, ST);
      }
      // Discard matches against the pre-shrink IR; re-match below.
      SDWAOperands.clear();

      // Generate potential match list.
      matchSDWAOperands(MBB);

      // Group matched operands by the instruction they would fold into,
      // keeping only instructions that can legally become SDWA.
      for (const auto &OperandPair : SDWAOperands) {
        const auto &Operand = OperandPair.second;
        MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
        if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
          PotentialMatches[PotentialMI].push_back(Operand.get());
        }
      }

      for (auto &PotentialPair : PotentialMatches) {
        MachineInstr &PotentialMI = *PotentialPair.first;
        convertToSDWA(PotentialMI, PotentialPair.second);
      }

      PotentialMatches.clear();
      SDWAOperands.clear();

      Changed = !ConvertedInstructions.empty();

      if (Changed)
        Ret = true;
      // New SDWA instructions may still carry SGPR/immediate sources that
      // the encoding forbids; rewrite those into VGPR copies.
      while (!ConvertedInstructions.empty())
        legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
    } while (Changed);
  }

  return Ret;
}