SIPreEmitPeephole.cpp
//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations before code emission.
///
/// Additionally, this pass unpacks packed instructions (currently
/// V_PK_MUL_F32, V_PK_ADD_F32, and V_PK_FMA_F32) adjacent to MFMAs so that
/// they can be co-issued. This helps overlap MFMAs with vector ALU work in
/// machine schedules and is expected to improve performance. Only packed
/// instructions whose execution fits within the MFMA latency window are
/// unpacked; the rest are left untouched.
/// TODO: Add support for F16 packed instructions.
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
                            MachineBasicBlock *&TrueMBB,
                            MachineBasicBlock *&FalseMBB,
                            SmallVectorImpl<MachineOperand> &Cond);
  bool mustRetainExeczBranch(const MachineInstr &Branch,
                             const MachineBasicBlock &From,
                             const MachineBasicBlock &To) const;
  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
  // Creates a list of packed instructions following an MFMA that are suitable
  // for unpacking.
  void collectUnpackingCandidates(MachineInstr &BeginMI,
                                  SetVector<MachineInstr *> &InstrsToUnpack,
                                  uint16_t NumMFMACycles);
  // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
  //     op_sel_hi:[0,0,0]
  // ==>
  // v_fma_f32 v0, v1, v3, v3
  // v_fma_f32 v1, v0, v2, v2
  // Here, we have overwritten v0 before we use it. This function checks if
  // unpacking can lead to such a situation.
  bool canUnpackingClobberRegister(const MachineInstr &MI);
  // Unpack and insert F32 packed instructions. Currently only V_PK_MUL_F32,
  // V_PK_ADD_F32, and V_PK_FMA_F32 are supported for this transformation.
  void performF32Unpacking(MachineInstr &I);
  // Select the corresponding unpacked instruction.
  uint16_t mapToUnpackedOpcode(MachineInstr &I);
  // Creates the unpacked instruction to be inserted. Adds source modifiers to
  // the unpacked instructions based on the source modifiers in the packed
  // instruction.
  MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
                                       bool IsHiBits);
  // Process operands/source modifiers from packed instructions and insert the
  // appropriate source modifiers and operands into the unpacked instructions.
  void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
                         bool IsHiBits, const MachineOperand &SrcMO);

public:
  bool run(MachineFunction &MF);
};

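// Legacy pass-manager wrapper. All of the work is done by
// SIPreEmitPeephole::run(); this class only registers the pass and forwards
// runOnMachineFunction() to it.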
class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
public:
  static char ID;

  SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    return SIPreEmitPeephole().run(MF);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeepholeLegacy::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 or 0 to a saved mask
  // and another block which consumes that saved mask and then a branch.
  //
  // While searching this also performs the following substitution:
  // vcc = V_CMP
  // vcc = S_AND exec, vcc
  // S_CBRANCH_VCC[N]Z
  // =>
  // vcc = V_CMP
  // S_CBRANCH_VCC[N]Z

  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
  const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if ((!Op1.isReg() || Op1.getReg() != ExecReg) && Op2.isReg() &&
      Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (!Op1.isReg() || Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    bool ModifiesExec = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
      ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
    }
    if (M == E)
      return Changed;
    // If SReg is VCC and its definition is a VALU comparison, the S_AND with
    // EXEC is not required: erase the S_AND and return.
    // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS.
    if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
        TII->isVOPC(*M)) {
      A->eraseFromParent();
      return true;
    }
    if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // First, if SReg is only used in the AND instruction, fold the immediate
    // into the AND.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2.
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
    if (!MI.killsRegister(CondReg, TRI)) {
      // Replace AND with MOV.
      if (MaskValue == 0) {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addImm(0);
      } else {
        BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
            .addReg(ExecReg);
      }
    }
    // Remove AND instruction.
    A->eraseFromParent();
  }

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch.
    // Remove all successors shadowed by the new unconditional branch.
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (auto *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to an unconditional branch.
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch.
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC.
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // Scan back to find an identical S_SET_GPR_IDX_ON.
  for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
                                         E = MI.getIterator();
       I != E; ++I) {
    if (I->isBundle() || I->isDebugInstr())
      continue;
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
            return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
          })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
                        I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
          return false;
      }
    }
  }

  MI.eraseFromBundle();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromBundle();
  return true;
}

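// Thin wrapper around analyzeBranch(): returns false if the terminators of
// SrcMBB cannot be understood, otherwise reports the true/false destinations,
// defaulting a missing false destination to the fall-through block.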
bool SIPreEmitPeephole::getBlockDestinations(
    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  if (!FalseMBB)
    FalseMBB = SrcMBB.getNextNode();

  return true;
}

namespace {
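// Cost model used to decide whether an s_cbranch_execz that skips a 'then'
// region can be removed: it compares the expected cost of unconditionally
// executing the region (with EXEC possibly zero) against the expected cost of
// keeping the branch, using the probability of the edge into the 'then' block.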
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  BranchProbability BranchProb;
  static constexpr uint64_t BranchNotTakenCost = 1;
  uint64_t BranchTakenCost;
  uint64_t ThenCyclesCost = 0;

public:
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end());

    BranchProb = Head.getSuccProbability(FromIt);
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
  }

  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

    // Consider `P = N/D` to be the probability of execz being false, i.e. of
    // falling through into the 'then' block rather than skipping it. The
    // transformation is profitable if always executing the 'then' block is
    // cheaper than sometimes executing 'then' and always executing
    // s_cbranch_execz:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
    //   BranchNotTakenCost
    uint64_t Numerator = BranchProb.getNumerator();
    uint64_t Denominator = BranchProb.getDenominator();
    return (Denominator - Numerator) * ThenCyclesCost <=
           ((Denominator - Numerator) * BranchTakenCost +
            Numerator * BranchNotTakenCost);
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
    const MachineInstr &Branch, const MachineBasicBlock &From,
    const MachineBasicBlock &To) const {
  assert(is_contained(Branch.getParent()->successors(), &From));
  BranchWeightCostModel CostModel{*TII, Branch, From};

  const MachineFunction *MF = From.getParent();
  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (const MachineInstr &MI : MBB) {
      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might never be taken when EXEC = 0.
      // Hence we should retain the cbranch out of the loop lest it become
      // infinite.
      if (MI.isConditionalBranch())
        return true;

      if (MI.isUnconditionalBranch() &&
          TII->getBranchDestBlock(MI) != MBB.getNextNode())
        return true;

      if (MI.isMetaInstruction())
        continue;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
        return true;

      if (!CostModel.isProfitable(MI))
        return true;
    }
  }

  return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
                                          MachineBasicBlock &SrcMBB) {

  if (!TII->getSchedModel().hasInstrSchedModel())
    return false;

  MachineBasicBlock *TrueMBB = nullptr;
  MachineBasicBlock *FalseMBB = nullptr;
  SmallVector<MachineOperand, 1> Cond;

  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
    return false;

  // Consider only the forward branches.
  if (SrcMBB.getNumber() >= TrueMBB->getNumber())
    return false;

  // Consider only when it is legal and profitable.
  if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
    return false;

  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
  MI.eraseFromParent();
  SrcMBB.removeSuccessor(TrueMBB);

  return true;
}

bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
  unsigned OpCode = MI.getOpcode();
  Register DstReg = MI.getOperand(0).getReg();
  // Only the first register in the register pair needs to be checked due to
  // the unpacking order. Packed instructions are unpacked such that the lower
  // 32 bits (i.e., the first register in the pair) are written first. This can
  // introduce dependencies if the first register is written in one instruction
  // and then read as part of the higher 32 bits in the subsequent instruction.
  // Such scenarios can arise due to specific combinations of op_sel and
  // op_sel_hi modifiers.
  Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

  const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (Src0MO && Src0MO->isReg()) {
    Register SrcReg0 = Src0MO->getReg();
    unsigned Src0Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
    Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
    // Check if the register selected by op_sel_hi is the same as the first
    // register in the destination register pair.
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
      return true;
  }

  const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  if (Src1MO && Src1MO->isReg()) {
    Register SrcReg1 = Src1MO->getReg();
    unsigned Src1Mods =
        TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
    Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
                             ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
                             : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
    if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
      return true;
  }

  // Applicable for packed instructions with 3 source operands, such as
  // V_PK_FMA.
  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *Src2MO =
        TII->getNamedOperand(MI, AMDGPU::OpName::src2);
    if (Src2MO && Src2MO->isReg()) {
      Register SrcReg2 = Src2MO->getReg();
      unsigned Src2Mods =
          TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
      Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
                               ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
                               : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
      if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
        return true;
    }
  }
  return false;
}

uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  // Use the 64-bit encoding to allow use of VOP3 instructions: VOP3 (e64)
  // instructions allow source modifiers, while e32 instructions do not.
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
  llvm_unreachable("Fully covered switch");
}

void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
                                          unsigned SrcMods, bool IsHiBits,
                                          const MachineOperand &SrcMO) {
  unsigned NewSrcMods = 0;
  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
  // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
  // for ABS modifiers.
  // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
  // lane.
  // NEG_HI shares the same bit position with ABS. But packed instructions do
  // not support ABS. Therefore, NEG_HI must be translated to NEG source
  // modifier for the higher 32 bits. Unpacked VOP3 instructions support
  // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
  // NEG modifier if present in the packed instruction.
  if (SrcMods & NegModifier)
    NewSrcMods |= SISrcMods::NEG;
  // Src modifiers. Only negative modifiers are added if needed. Unpacked
  // operations do not have op_sel, therefore it must be handled explicitly as
  // done below.
  NewMI.addImm(NewSrcMods);
  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
    return;
  }
  // If op_sel == 0, select register 0 of reg:sub0_sub1.
  Register UnpackedSrcReg = (SrcMods & OpSelModifier)
                                ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
                                : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);

  MachineOperand UnpackedSrcMO =
      MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
  if (SrcMO.isKill()) {
    // For each unpacked instruction, mark its source registers as killed if
    // the corresponding source register in the original packed instruction
    // was marked as killed.
    //
    // Exception:
    // If the op_sel and op_sel_hi modifiers require both unpacked instructions
    // to use the same register (e.g., due to overlapping access to low/high
    // bits of the same packed register), then only the *second* (latter)
    // instruction should mark the register as killed. This is because the
    // second instruction handles the higher bits and is effectively the last
    // user of the full register pair.

    bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
    bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
    bool KillState = true;
    if ((OpSel == OpSelHi) && !IsHiBits)
      KillState = false;
    UnpackedSrcMO.setIsKill(KillState);
  }
  NewMI.add(UnpackedSrcMO);
}

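// Scan forward from the MFMA, accumulating estimated issue cycles of the
// instructions that follow it. Candidates are collected only while the
// running cycle count stays within the MFMA's latency window and no
// register dependency or EXEC/mode hazard is found.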
void SIPreEmitPeephole::collectUnpackingCandidates(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  auto E = BB->end();
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();
  Register MFMADef = BeginMI.getOperand(0).getReg();

  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
    MachineInstr &Instr = *I;
    uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
    bool IsUnpackable =
        !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
    if (Instr.isMetaInstruction())
      continue;
    if ((Instr.isTerminator()) ||
        (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
        (SIInstrInfo::modifiesModeRegister(Instr) &&
         Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
      return;

    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    uint16_t Latency =
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
    TotalCyclesBetweenCandidates += Latency;

    if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
      return;
    // Identify register dependencies between those used by the MFMA
    // instruction and the following packed instructions. Also checks for
    // transitive dependencies between the MFMA def and candidate instruction
    // def and uses. Conservatively ensures that we do not incorrectly
    // read/write registers.
    for (const MachineOperand &InstrMO : Instr.operands()) {
      if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
        continue;
      if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
        return;
    }
    if (!IsUnpackable)
      continue;

    if (canUnpackingClobberRegister(Instr))
      return;
    // If it's a packed instruction, adjust latency: remove the packed
    // latency, add latency of two unpacked instructions (currently estimated
    // as 2 cycles).
    TotalCyclesBetweenCandidates -= Latency;
    // TODO: improve latency handling based on instruction modeling.
    TotalCyclesBetweenCandidates += 2;
    // Subtract 1 to account for MFMA issue latency.
    if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
      InstrsToUnpack.insert(&Instr);
  }
}

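// Worked example of the unpacking performed below (register numbers are
// illustrative only):
//   v_pk_mul_f32 v[4:5], v[0:1], v[2:3] op_sel:[0,1] op_sel_hi:[1,0] neg_hi:[1,0]
//   ==>
//   v_mul_f32_e64 v4, v0, v3   // lo: op_sel selects sub0 of src0, sub1 of src1
//   v_mul_f32_e64 v5, -v1, v2  // hi: op_sel_hi selects sub1 of src0, sub0 of
//                              //     src1; neg_hi becomes a NEG modifier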
void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
  const MachineOperand &DstOp = I.getOperand(0);

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
         "Unsupported Opcode");

  MachineInstrBuilder Op0LOp1L =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
  MachineOperand &LoDstOp = Op0LOp1L->getOperand(0);

  LoDstOp.setIsUndef(DstOp.isUndef());

  MachineInstrBuilder Op0HOp1H =
      createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
  MachineOperand &HiDstOp = Op0HOp1H->getOperand(0);

  uint32_t IFlags = I.getFlags();
  Op0LOp1L->setFlags(IFlags);
  Op0HOp1H->setFlags(IFlags);
  LoDstOp.setIsRenamable(DstOp.isRenamable());
  HiDstOp.setIsRenamable(DstOp.isRenamable());

  I.eraseFromParent();
}

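// The VOP3 (e64) operand order built below is: vdst, src0_modifiers, src0,
// src1_modifiers, src1, [src2_modifiers, src2,] clamp, omod.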
MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
                                                        uint16_t UnpackedOpcode,
                                                        bool IsHiBits) {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
  const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
  Register DstReg = I.getOperand(0).getReg();
  unsigned OpCode = I.getOpcode();
  Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
                                     : TRI->getSubReg(DstReg, AMDGPU::sub0);

  int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
  unsigned Src0Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
  unsigned Src1Mods =
      TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();

  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
  NewMI.addDef(UnpackedDstReg); // vdst
  addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
  addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);

  if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
    const MachineOperand *SrcMO2 =
        TII->getNamedOperand(I, AMDGPU::OpName::src2);
    unsigned Src2Mods =
        TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
    addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
  }
  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers, so it is safe to
  // use 0 here.
  NewMI.addImm(0); // omod
  return NewMI;
}

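// New pass manager entry point. SIPreEmitPeephole::run() renumbers the
// function's basic blocks unconditionally, so cached (post-)dominator trees
// must have their block numbers refreshed even when nothing else changed.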
PreservedAnalyses
llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
                                 MachineFunctionAnalysisManager &MFAM) {
  auto *MDT = MFAM.getCachedResult<MachineDominatorTreeAnalysis>(MF);
  auto *MPDT = MFAM.getCachedResult<MachinePostDominatorTreeAnalysis>(MF);

  if (SIPreEmitPeephole().run(MF))
    return getMachineFunctionPassPreservedAnalyses();

  if (MDT)
    MDT->updateBlockNumbers();
  if (MPDT)
    MPDT->updateBlockNumbers();
  return PreservedAnalyses::all();
}

bool SIPreEmitPeephole::run(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  MF.RenumberBlocks();

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    // Check first terminator for branches to optimize.
    if (TermI != MBB.end()) {
      MachineInstr &MI = *TermI;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        break;
      case AMDGPU::S_CBRANCH_EXECZ:
        Changed |= removeExeczBranch(MI, MBB);
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
    // second is not needed. Do the expensive checks in optimizeSetGPR()
    // and limit the distance to 20 instructions for compile time purposes.
    // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
    // may be bundled with the instructions they modify.
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  // TODO: Fold this into the previous block, if possible. Evaluate and handle
  // any side effects.

  // Perform the extra MF scans only for supported archs.
  if (!ST.hasGFX940Insts())
    return Changed;
  for (MachineBasicBlock &MBB : MF) {
    // Unpack packed instructions overlapped by MFMAs. This allows the
    // compiler to co-issue unpacked instructions with MFMAs.
    auto SchedModel = TII->getSchedModel();
    SetVector<MachineInstr *> InstrsToUnpack;
    for (auto &MI : make_early_inc_range(MBB.instrs())) {
      if (!SIInstrInfo::isMFMA(MI))
        continue;
      const MCSchedClassDesc *SchedClassDesc =
          SchedModel.resolveSchedClass(&MI);
      uint16_t NumMFMACycles =
          SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
      collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
    }
    for (MachineInstr *MI : InstrsToUnpack) {
      performF32Unpacking(*MI);
    }
  }

  return Changed;
}