LLVM 22.0.0git
SIPreEmitPeephole.cpp
1//===-- SIPreEmitPeephole.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass performs the peephole optimizations before code emission.
11///
12/// Additionally, this pass also unpacks packed instructions (V_PK_MUL_F32/F16,
13/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs such that they can be
14/// co-issued. This helps with overlapping MFMA and certain vector instructions
15/// in machine schedules and is expected to improve performance. Only packed
16/// instructions that fall within the MFMA's latency window are unpacked;
17/// the rest remain untouched.
18/// TODO: Add support for F16 packed instructions
19//===----------------------------------------------------------------------===//
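// For illustration only (register numbers are arbitrary): with the default
// op_sel:[0,0] and op_sel_hi:[1,1] modifiers, a packed multiply such as
//   v_pk_mul_f32 v[4:5], v[0:1], v[2:3]
// is unpacked into two VOP3 instructions:
//   v_mul_f32_e64 v4, v0, v2
//   v_mul_f32_e64 v5, v1, v3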
20
21#include "AMDGPU.h"
22#include "GCNSubtarget.h"
23#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
24#include "llvm/ADT/SetVector.h"
25#include "llvm/CodeGen/MachineFunctionPass.h"
26#include "llvm/CodeGen/TargetSchedModel.h"
27#include "llvm/Support/BranchProbability.h"
28
29using namespace llvm;
30
31#define DEBUG_TYPE "si-pre-emit-peephole"
32
33namespace {
34
35class SIPreEmitPeephole {
36private:
37 const SIInstrInfo *TII = nullptr;
38 const SIRegisterInfo *TRI = nullptr;
39
40 bool optimizeVccBranch(MachineInstr &MI) const;
41 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
42 bool getBlockDestinations(MachineBasicBlock &SrcMBB,
43 MachineBasicBlock *&TrueMBB,
44 MachineBasicBlock *&FalseMBB,
45 SmallVectorImpl<MachineOperand> &Cond);
46 bool mustRetainExeczBranch(const MachineInstr &Branch,
47 const MachineBasicBlock &From,
48 const MachineBasicBlock &To) const;
49 bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
50 // Creates a list of packed instructions following an MFMA that are suitable
51 // for unpacking.
52 void collectUnpackingCandidates(MachineInstr &BeginMI,
53 SetVector<MachineInstr *> &InstrsToUnpack,
54 uint16_t NumMFMACycles);
55 // v_pk_fma_f32 v[0:1], v[0:1], v[2:3], v[2:3] op_sel:[1,1,1]
56 // op_sel_hi:[0,0,0]
57 // ==>
58 // v_fma_f32 v0, v1, v3, v3
59 // v_fma_f32 v1, v0, v2, v2
60 // Here, we have overwritten v0 before we use it. This function checks if
61 // unpacking can lead to such a situation.
62 bool canUnpackingClobberRegister(const MachineInstr &MI);
63 // Unpack and insert F32 packed instructions.
64 // Currently, only V_PK_MUL, V_PK_ADD, and V_PK_FMA are supported for
65 // this transformation.
66 void performF32Unpacking(MachineInstr &I);
67 // Select corresponding unpacked instruction
68 uint16_t mapToUnpackedOpcode(MachineInstr &I);
69 // Creates the unpacked instruction to be inserted. Adds source modifiers to
70 // the unpacked instructions based on the source modifiers in the packed
71 // instruction.
72 MachineInstrBuilder createUnpackedMI(MachineInstr &I, uint16_t UnpackedOpcode,
73 bool IsHiBits);
74 // Process operands/source modifiers from packed instructions and insert the
75 // appropriate source modifiers and operands into the unpacked instructions.
76 void addOperandAndMods(MachineInstrBuilder &NewMI, unsigned SrcMods,
77 bool IsHiBits, const MachineOperand &SrcMO);
78
79public:
80 bool run(MachineFunction &MF);
81};
82
83class SIPreEmitPeepholeLegacy : public MachineFunctionPass {
84public:
85 static char ID;
86
87 SIPreEmitPeepholeLegacy() : MachineFunctionPass(ID) {
88 initializeSIPreEmitPeepholeLegacyPass(*PassRegistry::getPassRegistry());
89 }
90
91 bool runOnMachineFunction(MachineFunction &MF) override {
92 return SIPreEmitPeephole().run(MF);
93 }
94};
95
96} // End anonymous namespace.
97
98INITIALIZE_PASS(SIPreEmitPeepholeLegacy, DEBUG_TYPE,
99 "SI peephole optimizations", false, false)
100
101char SIPreEmitPeepholeLegacy::ID = 0;
102
103char &llvm::SIPreEmitPeepholeID = SIPreEmitPeepholeLegacy::ID;
104
105bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
106 // Match:
107 // sreg = -1 or 0
108 // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
109 // S_CBRANCH_VCC[N]Z
110 // =>
111 // S_CBRANCH_EXEC[N]Z
112 // We end up with this pattern sometimes after basic block placement.
113 // It happens while combining a block which assigns -1 or 0 to a saved mask
114 // and another block which consumes that saved mask and then a branch.
115 //
116 // While searching this also performs the following substitution:
117 // vcc = V_CMP
118 // vcc = S_AND exec, vcc
119 // S_CBRANCH_VCC[N]Z
120 // =>
121 // vcc = V_CMP
122 // S_CBRANCH_VCC[N]Z
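  // For example (wave64; registers and block numbers are illustrative), given:
  //   s[0:1] = S_MOV_B64 -1
  //   vcc    = S_AND_B64 exec, s[0:1]
  //   S_CBRANCH_VCCNZ %bb.2
  // and assuming vcc and scc have no other uses, the AND (and, if s[0:1] is
  // killed here, the preceding move) is erased and the branch becomes:
  //   S_CBRANCH_EXECNZ %bb.2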
123
124 bool Changed = false;
125 MachineBasicBlock &MBB = *MI.getParent();
126 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
127 const bool IsWave32 = ST.isWave32();
128 const unsigned CondReg = TRI->getVCC();
129 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
130 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
131 const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
132 const unsigned Mov = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
133
134 MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
135 E = MBB.rend();
136 bool ReadsCond = false;
137 unsigned Threshold = 5;
138 for (++A; A != E; ++A) {
139 if (!--Threshold)
140 return false;
141 if (A->modifiesRegister(ExecReg, TRI))
142 return false;
143 if (A->modifiesRegister(CondReg, TRI)) {
144 if (!A->definesRegister(CondReg, TRI) ||
145 (A->getOpcode() != And && A->getOpcode() != AndN2))
146 return false;
147 break;
148 }
149 ReadsCond |= A->readsRegister(CondReg, TRI);
150 }
151 if (A == E)
152 return false;
153
154 MachineOperand &Op1 = A->getOperand(1);
155 MachineOperand &Op2 = A->getOperand(2);
156 if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
157 TII->commuteInstruction(*A);
158 Changed = true;
159 }
160 if (Op1.getReg() != ExecReg)
161 return Changed;
162 if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
163 return Changed;
164
165 int64_t MaskValue = 0;
166 Register SReg;
167 if (Op2.isReg()) {
168 SReg = Op2.getReg();
169 auto M = std::next(A);
170 bool ReadsSreg = false;
171 bool ModifiesExec = false;
172 for (; M != E; ++M) {
173 if (M->definesRegister(SReg, TRI))
174 break;
175 if (M->modifiesRegister(SReg, TRI))
176 return Changed;
177 ReadsSreg |= M->readsRegister(SReg, TRI);
178 ModifiesExec |= M->modifiesRegister(ExecReg, TRI);
179 }
180 if (M == E)
181 return Changed;
182 // If SReg is VCC and SReg's definition is a VALU comparison, the S_AND
183 // with EXEC is not required.
184 // Erase the S_AND and return.
185 // Note: isVOPC is used instead of isCompare to catch V_CMP_CLASS
186 if (A->getOpcode() == And && SReg == CondReg && !ModifiesExec &&
187 TII->isVOPC(*M)) {
188 A->eraseFromParent();
189 return true;
190 }
191 if (!M->isMoveImmediate() || !M->getOperand(1).isImm() ||
192 (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
193 return Changed;
194 MaskValue = M->getOperand(1).getImm();
195 // First, if SReg is only used in the AND instruction, fold the immediate
196 // into the AND.
197 if (!ReadsSreg && Op2.isKill()) {
198 A->getOperand(2).ChangeToImmediate(MaskValue);
199 M->eraseFromParent();
200 }
201 } else if (Op2.isImm()) {
202 MaskValue = Op2.getImm();
203 } else {
204 llvm_unreachable("Op2 must be register or immediate");
205 }
206
207 // Invert mask for s_andn2
208 assert(MaskValue == 0 || MaskValue == -1);
209 if (A->getOpcode() == AndN2)
210 MaskValue = ~MaskValue;
211
212 if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC, /*TRI=*/nullptr)) {
213 if (!MI.killsRegister(CondReg, TRI)) {
214 // Replace AND with MOV
215 if (MaskValue == 0) {
216 BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
217 .addImm(0);
218 } else {
219 BuildMI(*A->getParent(), *A, A->getDebugLoc(), TII->get(Mov), CondReg)
220 .addReg(ExecReg);
221 }
222 }
223 // Remove AND instruction
224 A->eraseFromParent();
225 }
226
227 bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
228 if (SReg == ExecReg) {
229 // EXEC is updated directly
230 if (IsVCCZ) {
231 MI.eraseFromParent();
232 return true;
233 }
234 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
235 } else if (IsVCCZ && MaskValue == 0) {
236 // Will always branch
237 // Remove all successors shadowed by new unconditional branch
238 MachineBasicBlock *Parent = MI.getParent();
239 SmallVector<MachineInstr *, 4> ToRemove;
240 bool Found = false;
241 for (MachineInstr &Term : Parent->terminators()) {
242 if (Found) {
243 if (Term.isBranch())
244 ToRemove.push_back(&Term);
245 } else {
246 Found = Term.isIdenticalTo(MI);
247 }
248 }
249 assert(Found && "conditional branch is not terminator");
250 for (auto *BranchMI : ToRemove) {
251 MachineOperand &Dst = BranchMI->getOperand(0);
252 assert(Dst.isMBB() && "destination is not basic block");
253 Parent->removeSuccessor(Dst.getMBB());
254 BranchMI->eraseFromParent();
255 }
256
257 if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
258 Parent->removeSuccessor(Succ);
259 }
260
261 // Rewrite to unconditional branch
262 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
263 } else if (!IsVCCZ && MaskValue == 0) {
264 // Will never branch
265 MachineOperand &Dst = MI.getOperand(0);
266 assert(Dst.isMBB() && "destination is not basic block");
267 MI.getParent()->removeSuccessor(Dst.getMBB());
268 MI.eraseFromParent();
269 return true;
270 } else if (MaskValue == -1) {
271 // Depends only on EXEC
272 MI.setDesc(
273 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
274 }
275
276 MI.removeOperand(MI.findRegisterUseOperandIdx(CondReg, TRI, false /*Kill*/));
277 MI.addImplicitDefUseOperands(*MBB.getParent());
278
279 return true;
280}
281
282bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
283 MachineInstr &MI) const {
284 MachineBasicBlock &MBB = *MI.getParent();
285 const MachineFunction &MF = *MBB.getParent();
286 const MachineRegisterInfo &MRI = MF.getRegInfo();
287 MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
288 Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
289 SmallVector<MachineInstr *, 4> ToRemove;
290 bool IdxOn = true;
291
292 if (!MI.isIdenticalTo(First))
293 return false;
294
295 // Scan between the two identical S_SET_GPR_IDX_ON to prove the second is redundant.
296 for (MachineBasicBlock::instr_iterator I = std::next(First.getIterator()),
297 E = MI.getIterator();
298 I != E; ++I) {
299 if (I->isBundle())
300 continue;
301 switch (I->getOpcode()) {
302 case AMDGPU::S_SET_GPR_IDX_MODE:
303 return false;
304 case AMDGPU::S_SET_GPR_IDX_OFF:
305 IdxOn = false;
306 ToRemove.push_back(&*I);
307 break;
308 default:
309 if (I->modifiesRegister(AMDGPU::M0, TRI))
310 return false;
311 if (IdxReg && I->modifiesRegister(IdxReg, TRI))
312 return false;
313 if (llvm::any_of(I->operands(), [&MRI, this](const MachineOperand &MO) {
314 return MO.isReg() && TRI->isVectorRegister(MRI, MO.getReg());
315 })) {
316 // The only exception allowed here is another indirect vector move
317 // with the same mode.
318 if (!IdxOn || !(I->getOpcode() == AMDGPU::V_MOV_B32_indirect_write ||
319 I->getOpcode() == AMDGPU::V_MOV_B32_indirect_read))
320 return false;
321 }
322 }
323 }
324
325 MI.eraseFromBundle();
326 for (MachineInstr *RI : ToRemove)
327 RI->eraseFromBundle();
328 return true;
329}
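// Illustrative example (registers are arbitrary): two identical
//   s_set_gpr_idx_on s2, gpr_idx(SRC0)
// instructions separated only by indirect V_MOV_B32 reads/writes and an
// s_set_gpr_idx_off make the second s_set_gpr_idx_on (and the intervening
// s_set_gpr_idx_off) redundant; both are erased, leaving one index-mode region.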
330
331bool SIPreEmitPeephole::getBlockDestinations(
332 MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
333 MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
334 if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
335 return false;
336
337 if (!FalseMBB)
338 FalseMBB = SrcMBB.getNextNode();
339
340 return true;
341}
342
343namespace {
344class BranchWeightCostModel {
345 const SIInstrInfo &TII;
346 const TargetSchedModel &SchedModel;
347 BranchProbability BranchProb;
348 static constexpr uint64_t BranchNotTakenCost = 1;
349 uint64_t BranchTakenCost;
350 uint64_t ThenCyclesCost = 0;
351
352public:
353 BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
354 const MachineBasicBlock &Succ)
355 : TII(TII), SchedModel(TII.getSchedModel()) {
356 const MachineBasicBlock &Head = *Branch.getParent();
357 const auto *FromIt = find(Head.successors(), &Succ);
358 assert(FromIt != Head.succ_end());
359
360 BranchProb = Head.getSuccProbability(FromIt);
361 if (BranchProb.isUnknown())
362 BranchProb = BranchProbability::getZero();
363 BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
364 }
365
366 bool isProfitable(const MachineInstr &MI) {
367 if (TII.isWaitcnt(MI.getOpcode()))
368 return false;
369
370 ThenCyclesCost += SchedModel.computeInstrLatency(&MI);
371
372 // Consider `P = N/D` to be the probability of execz being false (i.e., of
373 // executing the then-block). The transformation is profitable if always executing the
374 // 'then' block is cheaper than executing sometimes 'then' and always
375 // executing s_cbranch_execz:
376 // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
377 // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
378 // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
379 // BranchNotTakenCost
380 uint64_t Numerator = BranchProb.getNumerator();
381 uint64_t Denominator = BranchProb.getDenominator();
382 return (Denominator - Numerator) * ThenCyclesCost <=
383 ((Denominator - Numerator) * BranchTakenCost +
384 Numerator * BranchNotTakenCost);
385 }
386};
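// Worked example with illustrative numbers: suppose P = N/D = 1/2,
// ThenCyclesCost = 6, BranchTakenCost = 4 and BranchNotTakenCost = 1. Then
//   (D - N) * ThenCyclesCost = (1/2) * D * 6
//   (D - N) * BranchTakenCost + N * BranchNotTakenCost
//     = (1/2) * D * 4 + (1/2) * D * 1 = (1/2) * D * 5
// Since 6 > 5, isProfitable() returns false and the execz branch is retained.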
387
388bool SIPreEmitPeephole::mustRetainExeczBranch(
389 const MachineInstr &Branch, const MachineBasicBlock &From,
390 const MachineBasicBlock &To) const {
391 assert(is_contained(Branch.getParent()->successors(), &From));
392 BranchWeightCostModel CostModel{*TII, Branch, From};
393
394 const MachineFunction *MF = From.getParent();
395 for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
396 MBBI != End && MBBI != ToI; ++MBBI) {
397 const MachineBasicBlock &MBB = *MBBI;
398
399 for (const MachineInstr &MI : MBB) {
400 // When a uniform loop is inside non-uniform control flow, the branch
401 // leaving the loop might never be taken when EXEC = 0.
402 // Hence we should retain cbranch out of the loop lest it become infinite.
403 if (MI.isConditionalBranch())
404 return true;
405
406 if (MI.isUnconditionalBranch() &&
407 TII->getBranchDestBlock(MI) != MBB.getNextNode())
408 return true;
409
410 if (MI.isMetaInstruction())
411 continue;
412
413 if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
414 return true;
415
416 if (!CostModel.isProfitable(MI))
417 return true;
418 }
419 }
420
421 return false;
422}
423} // namespace
424
425// Returns true if the skip branch instruction is removed.
426bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
427 MachineBasicBlock &SrcMBB) {
428
429 if (!TII->getSchedModel().hasInstrSchedModel())
430 return false;
431
432 MachineBasicBlock *TrueMBB = nullptr;
433 MachineBasicBlock *FalseMBB = nullptr;
434 SmallVector<MachineOperand, 1> Cond;
435
436 if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
437 return false;
438
439 // Consider only the forward branches.
440 if (SrcMBB.getNumber() >= TrueMBB->getNumber())
441 return false;
442
443 // Consider only when it is legal and profitable
444 if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
445 return false;
446
447 LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
448 MI.eraseFromParent();
449 SrcMBB.removeSuccessor(TrueMBB);
450
451 return true;
452}
453
454bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
455 unsigned OpCode = MI.getOpcode();
456 Register DstReg = MI.getOperand(0).getReg();
457 // Only the first register in the register pair needs to be checked due to the
458 // unpacking order. Packed instructions are unpacked such that the lower 32
459 // bits (i.e., the first register in the pair) are written first. This can
460 // introduce dependencies if the first register is written in one instruction
461 // and then read as part of the higher 32 bits in the subsequent instruction.
462 // Such scenarios can arise due to specific combinations of op_sel and
463 // op_sel_hi modifiers.
464 Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
465
466 const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
467 if (Src0MO && Src0MO->isReg()) {
468 Register SrcReg0 = Src0MO->getReg();
469 unsigned Src0Mods =
470 TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
471 Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
472 ? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
473 : TRI->getSubReg(SrcReg0, AMDGPU::sub0);
474 // Check if the register selected by op_sel_hi is the same as the first
475 // register in the destination register pair.
476 if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
477 return true;
478 }
479
480 const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
481 if (Src1MO && Src1MO->isReg()) {
482 Register SrcReg1 = Src1MO->getReg();
483 unsigned Src1Mods =
484 TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm();
485 Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
486 ? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
487 : TRI->getSubReg(SrcReg1, AMDGPU::sub0);
488 if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
489 return true;
490 }
491
492 // Applicable for packed instructions with 3 source operands, such as
493 // V_PK_FMA.
494 if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
495 const MachineOperand *Src2MO =
496 TII->getNamedOperand(MI, AMDGPU::OpName::src2);
497 if (Src2MO && Src2MO->isReg()) {
498 Register SrcReg2 = Src2MO->getReg();
499 unsigned Src2Mods =
500 TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
501 Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
502 ? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
503 : TRI->getSubReg(SrcReg2, AMDGPU::sub0);
504 if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
505 return true;
506 }
507 }
508 return false;
509}
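// Illustrative example of a rejected candidate: for
//   v_pk_fma_f32 v[0:1], v[2:3], v[0:1], v[4:5] op_sel_hi:[1,0,1]
// src1's op_sel_hi bit is 0, so its high half reads v0 (sub0 of v[0:1]). That
// register is also the low half of the destination, which the first unpacked
// V_FMA_F32 writes before the second one reads it, so unpacking is not done.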
510
511uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
512 unsigned Opcode = I.getOpcode();
513 // Use the 64-bit encoding to allow use of VOP3 instructions.
514 // VOP3 e64 instructions allow source modifiers;
515 // e32 instructions do not.
516 switch (Opcode) {
517 case AMDGPU::V_PK_ADD_F32:
518 return AMDGPU::V_ADD_F32_e64;
519 case AMDGPU::V_PK_MUL_F32:
520 return AMDGPU::V_MUL_F32_e64;
521 case AMDGPU::V_PK_FMA_F32:
522 return AMDGPU::V_FMA_F32_e64;
523 default:
524 return std::numeric_limits<uint16_t>::max();
525 }
526 llvm_unreachable("Fully covered switch");
527}
528
529void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
530 unsigned SrcMods, bool IsHiBits,
531 const MachineOperand &SrcMO) {
532 unsigned NewSrcMods = 0;
533 unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
534 unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
535 // Packed instructions (VOP3P) do not support ABS. Hence, no checks are done
536 // for ABS modifiers.
537 // If NEG or NEG_HI is true, we need to negate the corresponding 32 bit
538 // lane.
539 // NEG_HI shares the same bit position with ABS. But packed instructions do
540 // not support ABS. Therefore, NEG_HI must be translated to NEG source
541 // modifier for the higher 32 bits. Unpacked VOP3 instructions support
542 // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
543 // NEG modifier if present in the packed instruction.
544 if (SrcMods & NegModifier)
545 NewSrcMods |= SISrcMods::NEG;
546 // Src modifiers. Only negative modifiers are added if needed. Unpacked
547 // operations do not have op_sel, therefore it must be handled explicitly as
548 // done below.
549 NewMI.addImm(NewSrcMods);
550 if (SrcMO.isImm()) {
551 NewMI.addImm(SrcMO.getImm());
552 return;
553 }
554 // If op_sel == 0, select register 0 of reg:sub0_sub1.
555 Register UnpackedSrcReg = (SrcMods & OpSelModifier)
556 ? TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub1)
557 : TRI->getSubReg(SrcMO.getReg(), AMDGPU::sub0);
558
559 MachineOperand UnpackedSrcMO =
560 MachineOperand::CreateReg(UnpackedSrcReg, /*isDef=*/false);
561 if (SrcMO.isKill()) {
562 // For each unpacked instruction, mark its source registers as killed if the
563 // corresponding source register in the original packed instruction was
564 // marked as killed.
565 //
566 // Exception:
567 // If the op_sel and op_sel_hi modifiers require both unpacked instructions
568 // to use the same register (e.g., due to overlapping access to low/high
569 // bits of the same packed register), then only the *second* (latter)
570 // instruction should mark the register as killed. This is because the
571 // second instruction handles the higher bits and is effectively the last
572 // user of the full register pair.
573
574 bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
575 bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
576 bool KillState = true;
577 if ((OpSel == OpSelHi) && !IsHiBits)
578 KillState = false;
579 UnpackedSrcMO.setIsKill(KillState);
580 }
581 NewMI.add(UnpackedSrcMO);
582}
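// Illustrative example (registers are arbitrary): for
//   v_pk_add_f32 v[4:5], v[0:1], v[2:3] neg_lo:[1,0] neg_hi:[0,1]
// with the default op_sel:[0,0] and op_sel_hi:[1,1], the unpacked pair is
//   v_add_f32_e64 v4, -v0, v2
//   v_add_f32_e64 v5, v1, -v3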
583
584void SIPreEmitPeephole::collectUnpackingCandidates(
585 MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
586 uint16_t NumMFMACycles) {
587 auto *BB = BeginMI.getParent();
588 auto E = BB->end();
589 int TotalCyclesBetweenCandidates = 0;
590 auto SchedModel = TII->getSchedModel();
591 Register MFMADef = BeginMI.getOperand(0).getReg();
592
593 for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
594 MachineInstr &Instr = *I;
595 uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
596 bool IsUnpackable =
597 !(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
598 if (Instr.isMetaInstruction())
599 continue;
600 if ((Instr.isTerminator()) ||
601 (TII->isNeverCoissue(Instr) && !IsUnpackable) ||
602 (SIInstrInfo::modifiesModeRegister(Instr) ||
603 Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
604 return;
605
606 const MCSchedClassDesc *InstrSchedClassDesc =
607 SchedModel.resolveSchedClass(&Instr);
608 uint16_t Latency =
609 SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
610 TotalCyclesBetweenCandidates += Latency;
611
612 if (TotalCyclesBetweenCandidates >= NumMFMACycles - 1)
613 return;
614 // Identify register dependencies between those used by the MFMA
615 // instruction and the following packed instructions. Also check for
616 // transitive dependencies between the MFMA def and the candidate
617 // instruction's defs and uses. Conservatively ensure that we do not
618 // incorrectly read/write registers.
619 for (const MachineOperand &InstrMO : Instr.operands()) {
620 if (!InstrMO.isReg() || !InstrMO.getReg().isValid())
621 continue;
622 if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
623 return;
624 }
625 if (!IsUnpackable)
626 continue;
627
628 if (canUnpackingClobberRegister(Instr))
629 return;
630 // If it's a packed instruction, adjust latency: remove the packed
631 // latency, add latency of two unpacked instructions (currently estimated
632 // as 2 cycles).
633 TotalCyclesBetweenCandidates -= Latency;
634 // TODO: improve latency handling based on instruction modeling.
635 TotalCyclesBetweenCandidates += 2;
636 // Subtract 1 to account for MFMA issue latency.
637 if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
638 InstrsToUnpack.insert(&Instr);
639 }
640}
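// Budget example with illustrative numbers: if the MFMA's ReleaseAtCycle is 16,
// candidates are collected only while the accumulated latency of the scanned
// instructions stays below 15 (NumMFMACycles - 1); each instruction that is
// unpacked is re-costed as two unpacked instructions (2 cycles) instead of its
// packed latency.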
641
642void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
643 MachineOperand DstOp = I.getOperand(0);
644
645 uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
646 assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
647 "Unsupported Opcode");
648
649 MachineInstrBuilder Op0LOp1L =
650 createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
651 MachineOperand LoDstOp = Op0LOp1L->getOperand(0);
652
653 LoDstOp.setIsUndef(DstOp.isUndef());
654
655 MachineInstrBuilder Op0HOp1H =
656 createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/true);
657 MachineOperand HiDstOp = Op0HOp1H->getOperand(0);
658
659 uint32_t IFlags = I.getFlags();
660 Op0LOp1L->setFlags(IFlags);
661 Op0HOp1H->setFlags(IFlags);
662 LoDstOp.setIsRenamable(DstOp.isRenamable());
663 HiDstOp.setIsRenamable(DstOp.isRenamable());
664
665 I.eraseFromParent();
666}
667
668MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
669 uint16_t UnpackedOpcode,
670 bool IsHiBits) {
671 MachineBasicBlock &MBB = *I.getParent();
672 const DebugLoc &DL = I.getDebugLoc();
673 const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
674 const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
675 Register DstReg = I.getOperand(0).getReg();
676 unsigned OpCode = I.getOpcode();
677 Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
678 : TRI->getSubReg(DstReg, AMDGPU::sub0);
679
680 int64_t ClampVal = TII->getNamedOperand(I, AMDGPU::OpName::clamp)->getImm();
681 unsigned Src0Mods =
682 TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
683 unsigned Src1Mods =
684 TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
685
686 MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
687 NewMI.addDef(UnpackedDstReg); // vdst
688 addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
689 addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
690
691 if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
692 const MachineOperand *SrcMO2 =
693 TII->getNamedOperand(I, AMDGPU::OpName::src2);
694 unsigned Src2Mods =
695 TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
696 addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
697 }
698 NewMI.addImm(ClampVal); // clamp
699 // Packed instructions do not support output modifiers, so it is safe to
700 // assign 0 for this use case.
701 NewMI.addImm(0); // omod
702 return NewMI;
703}
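// The resulting VOP3 operand order is: vdst, src0_modifiers, src0,
// src1_modifiers, src1, [src2_modifiers, src2,] clamp, omod.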
704
705PreservedAnalyses
706llvm::SIPreEmitPeepholePass::run(MachineFunction &MF,
707 MachineFunctionAnalysisManager &MFAM) {
708 if (!SIPreEmitPeephole().run(MF))
709 return PreservedAnalyses::all();
710
711 return getMachineFunctionPassPreservedAnalyses();
712}
713
714bool SIPreEmitPeephole::run(MachineFunction &MF) {
715 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
716 TII = ST.getInstrInfo();
717 TRI = &TII->getRegisterInfo();
718 bool Changed = false;
719
720 MF.RenumberBlocks();
721
722 for (MachineBasicBlock &MBB : MF) {
723 MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
724 // Check first terminator for branches to optimize
725 if (TermI != MBB.end()) {
726 MachineInstr &MI = *TermI;
727 switch (MI.getOpcode()) {
728 case AMDGPU::S_CBRANCH_VCCZ:
729 case AMDGPU::S_CBRANCH_VCCNZ:
730 Changed |= optimizeVccBranch(MI);
731 break;
732 case AMDGPU::S_CBRANCH_EXECZ:
733 Changed |= removeExeczBranch(MI, MBB);
734 break;
735 }
736 }
737
738 if (!ST.hasVGPRIndexMode())
739 continue;
740
741 MachineInstr *SetGPRMI = nullptr;
742 const unsigned Threshold = 20;
743 unsigned Count = 0;
744 // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
745 // second is not needed. Do expensive checks in the optimizeSetGPR()
746 // and limit the distance to 20 instructions for compile time purposes.
747 // Note: this needs to work on bundles as S_SET_GPR_IDX* instructions
748 // may be bundled with the instructions they modify.
749 for (auto &MI : make_early_inc_range(MBB.instrs())) {
750 if (Count == Threshold)
751 SetGPRMI = nullptr;
752 else
753 ++Count;
754
755 if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
756 continue;
757
758 Count = 0;
759 if (!SetGPRMI) {
760 SetGPRMI = &MI;
761 continue;
762 }
763
764 if (optimizeSetGPR(*SetGPRMI, MI))
765 Changed = true;
766 else
767 SetGPRMI = &MI;
768 }
769 }
770
771 // TODO: Fold this into previous block, if possible. Evaluate and handle any
772 // side effects.
773
774 // Perform the extra MF scans only for supported archs
775 if (!ST.hasGFX940Insts())
776 return Changed;
777 for (MachineBasicBlock &MBB : MF) {
778 // Unpack packed instructions overlapped by MFMAs. This allows the
779 // compiler to co-issue unpacked instructions with MFMAs.
780 auto SchedModel = TII->getSchedModel();
781 SetVector<MachineInstr *> InstrsToUnpack;
782 for (auto &MI : make_early_inc_range(MBB.instrs())) {
783 if (!SIInstrInfo::isMFMA(MI))
784 continue;
785 const MCSchedClassDesc *SchedClassDesc =
786 SchedModel.resolveSchedClass(&MI);
787 uint16_t NumMFMACycles =
788 SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
789 collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
790 }
791 for (MachineInstr *MI : InstrsToUnpack) {
792 performF32Unpacking(*MI);
793 }
794 }
795
796 return Changed;
797}