24#define DEBUG_TYPE "si-optimize-exec-masking"
28class SIOptimizeExecMasking {
58 bool IgnoreStart =
false)
const;
65 unsigned MaxInstructions = 20)
const;
66 bool optimizeExecSequence();
68 bool optimizeVCMPSaveExecSequence(
MachineInstr &SaveExecInstr,
72 bool optimizeOrSaveexecXorSequences();
84 return "SI optimize exec mask operations";
98 SIOptimizeExecMasking Impl(&MF);
109 "SI optimize exec mask operations",
false,
false)
114char SIOptimizeExecMaskingLegacy::
ID = 0;
120 switch (
MI.getOpcode()) {
122 case AMDGPU::S_MOV_B64:
123 case AMDGPU::S_MOV_B64_term:
124 case AMDGPU::S_MOV_B32:
125 case AMDGPU::S_MOV_B32_term: {
126 const MachineOperand &Src = MI.getOperand(1);
127 if (Src.isReg() && Src.getReg() == LMC.ExecReg)
128 return MI.getOperand(0).getReg();
132 return AMDGPU::NoRegister;
137 switch (
MI.getOpcode()) {
139 case AMDGPU::S_MOV_B64:
140 case AMDGPU::S_MOV_B32: {
142 if (Dst.isReg() && Dst.getReg() == LMC.ExecReg &&
MI.getOperand(1).isReg())
143 return MI.getOperand(1).getReg();
146 case AMDGPU::S_MOV_B64_term:
147 case AMDGPU::S_MOV_B32_term:
157 switch (
MI.getOpcode()) {
158 case AMDGPU::S_AND_B64:
159 case AMDGPU::S_OR_B64:
160 case AMDGPU::S_XOR_B64:
161 case AMDGPU::S_ANDN2_B64:
162 case AMDGPU::S_ORN2_B64:
163 case AMDGPU::S_NAND_B64:
164 case AMDGPU::S_NOR_B64:
165 case AMDGPU::S_XNOR_B64: {
168 return MI.getOperand(0).getReg();
171 return MI.getOperand(0).getReg();
174 case AMDGPU::S_AND_B32:
175 case AMDGPU::S_OR_B32:
176 case AMDGPU::S_XOR_B32:
177 case AMDGPU::S_ANDN2_B32:
178 case AMDGPU::S_ORN2_B32:
179 case AMDGPU::S_NAND_B32:
180 case AMDGPU::S_NOR_B32:
181 case AMDGPU::S_XNOR_B32: {
183 if (Src1.
isReg() && Src1.
getReg() == AMDGPU::EXEC_LO)
184 return MI.getOperand(0).getReg();
186 if (Src2.
isReg() && Src2.
getReg() == AMDGPU::EXEC_LO)
187 return MI.getOperand(0).getReg();
192 return AMDGPU::NoRegister;
197 case AMDGPU::S_AND_B64:
198 return AMDGPU::S_AND_SAVEEXEC_B64;
199 case AMDGPU::S_OR_B64:
200 return AMDGPU::S_OR_SAVEEXEC_B64;
201 case AMDGPU::S_XOR_B64:
202 return AMDGPU::S_XOR_SAVEEXEC_B64;
203 case AMDGPU::S_ANDN2_B64:
204 return AMDGPU::S_ANDN2_SAVEEXEC_B64;
205 case AMDGPU::S_ORN2_B64:
206 return AMDGPU::S_ORN2_SAVEEXEC_B64;
207 case AMDGPU::S_NAND_B64:
208 return AMDGPU::S_NAND_SAVEEXEC_B64;
209 case AMDGPU::S_NOR_B64:
210 return AMDGPU::S_NOR_SAVEEXEC_B64;
211 case AMDGPU::S_XNOR_B64:
212 return AMDGPU::S_XNOR_SAVEEXEC_B64;
213 case AMDGPU::S_AND_B32:
214 return AMDGPU::S_AND_SAVEEXEC_B32;
215 case AMDGPU::S_OR_B32:
216 return AMDGPU::S_OR_SAVEEXEC_B32;
217 case AMDGPU::S_XOR_B32:
218 return AMDGPU::S_XOR_SAVEEXEC_B32;
219 case AMDGPU::S_ANDN2_B32:
220 return AMDGPU::S_ANDN2_SAVEEXEC_B32;
221 case AMDGPU::S_ORN2_B32:
222 return AMDGPU::S_ORN2_SAVEEXEC_B32;
223 case AMDGPU::S_NAND_B32:
224 return AMDGPU::S_NAND_SAVEEXEC_B32;
225 case AMDGPU::S_NOR_B32:
226 return AMDGPU::S_NOR_SAVEEXEC_B32;
227 case AMDGPU::S_XNOR_B32:
228 return AMDGPU::S_XNOR_SAVEEXEC_B32;
230 return AMDGPU::INSTRUCTION_LIST_END;
236bool SIOptimizeExecMasking::removeTerminatorBit(
MachineInstr &
MI)
const {
237 switch (
MI.getOpcode()) {
238 case AMDGPU::S_MOV_B32_term: {
239 bool RegSrc =
MI.getOperand(1).isReg();
240 MI.setDesc(
TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
243 case AMDGPU::S_MOV_B64_term: {
244 bool RegSrc =
MI.getOperand(1).isReg();
245 MI.setDesc(
TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
248 case AMDGPU::S_XOR_B64_term: {
251 MI.setDesc(
TII->get(AMDGPU::S_XOR_B64));
254 case AMDGPU::S_XOR_B32_term: {
257 MI.setDesc(
TII->get(AMDGPU::S_XOR_B32));
260 case AMDGPU::S_OR_B64_term: {
263 MI.setDesc(
TII->get(AMDGPU::S_OR_B64));
266 case AMDGPU::S_OR_B32_term: {
269 MI.setDesc(
TII->get(AMDGPU::S_OR_B32));
272 case AMDGPU::S_ANDN2_B64_term: {
275 MI.setDesc(
TII->get(AMDGPU::S_ANDN2_B64));
278 case AMDGPU::S_ANDN2_B32_term: {
281 MI.setDesc(
TII->get(AMDGPU::S_ANDN2_B32));
284 case AMDGPU::S_AND_B64_term: {
287 MI.setDesc(
TII->get(AMDGPU::S_AND_B64));
290 case AMDGPU::S_AND_B32_term: {
293 MI.setDesc(
TII->get(AMDGPU::S_AND_B32));
310 for (;
I !=
E; ++
I) {
311 if (!
I->isTerminator())
312 return Seen ? FirstNonTerm :
I;
314 if (removeTerminatorBit(*
I)) {
327 const unsigned InstLimit = 25;
330 for (
unsigned N = 0;
N <= InstLimit &&
I !=
E; ++
I, ++
N) {
331 Register CopyFromExec = isCopyFromExec(*
I);
344 if (Succ->isLiveIn(
Reg))
360 unsigned MaxInstructions)
const {
363 unsigned CurrentIteration = 0;
365 for (++
A; CurrentIteration < MaxInstructions &&
A !=
E; ++
A) {
366 if (
A->isDebugInstr())
373 if (
A->modifiesRegister(
Reg,
TRI))
380 if (Terminator && KillFlagCandidates &&
A != Terminator &&
383 if (MO.isReg() && MO.isKill()) {
385 if (Candidate !=
Reg &&
TRI->regsOverlap(Candidate,
Reg))
404bool SIOptimizeExecMasking::isRegisterInUseBetween(
MachineInstr &Stop,
408 bool IgnoreStart)
const {
422 return !LR.available(
Reg) ||
MRI->isReserved(
Reg);
427bool SIOptimizeExecMasking::isRegisterInUseAfter(
MachineInstr &Stop,
442bool SIOptimizeExecMasking::optimizeExecSequence() {
454 unsigned SearchCount = 0;
455 const unsigned SearchLimit = 5;
456 while (
I !=
E && SearchCount++ < SearchLimit) {
457 CopyToExec = isCopyToExec(*
I);
467 auto *CopyToExecInst = &*
I;
468 auto CopyFromExecInst = findExecCopy(
MBB,
I);
469 if (CopyFromExecInst ==
E) {
470 auto PrepareExecInst = std::next(
I);
471 if (PrepareExecInst ==
E)
474 if (CopyToExecInst->getOperand(1).isKill() &&
478 PrepareExecInst->getOperand(0).setReg(LMC.ExecReg);
482 CopyToExecInst->eraseFromParent();
495 Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
500 J = std::next(CopyFromExecInst->getIterator()),
501 JE =
I->getIterator();
503 if (SaveExecInst && J->readsRegister(LMC.ExecReg,
TRI)) {
504 LLVM_DEBUG(
dbgs() <<
"exec read prevents saveexec: " << *J <<
'\n');
507 SaveExecInst =
nullptr;
511 bool ReadsCopyFromExec = J->readsRegister(CopyFromExec,
TRI);
513 if (J->modifiesRegister(CopyToExec,
TRI)) {
517 SaveExecInst =
nullptr;
522 if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
525 if (ReadsCopyFromExec) {
527 LLVM_DEBUG(
dbgs() <<
"Found save exec op: " << *SaveExecInst <<
'\n');
530 LLVM_DEBUG(
dbgs() <<
"Instruction does not read exec copy: " << *J
534 if (ReadsCopyFromExec && !SaveExecInst) {
543 LLVM_DEBUG(
dbgs() <<
"Found second use of save inst candidate: " << *J
548 if (SaveExecInst && J->readsRegister(CopyToExec,
TRI)) {
549 assert(SaveExecInst != &*J);
557 LLVM_DEBUG(
dbgs() <<
"Insert save exec op: " << *SaveExecInst <<
'\n');
564 if (Src0.
isReg() && Src0.
getReg() == CopyFromExec) {
566 }
else if (Src1.
isReg() && Src1.
getReg() == CopyFromExec) {
574 CopyFromExecInst->eraseFromParent();
584 CopyToExecInst->eraseFromParent();
587 OtherInst->substituteRegister(CopyToExec, LMC.ExecReg,
588 AMDGPU::NoSubRegister, *
TRI);
599bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
613 bool IsSGPR32 =
TRI->getRegSizeInBits(MoveDest, *
MRI) == 32;
614 unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
625 auto TryAddImmediateValueFromNamedOperand =
626 [&](AMDGPU::OpName OperandName) ->
void {
627 if (
auto *
Mod =
TII->getNamedOperand(VCmp, OperandName))
628 Builder.addImm(
Mod->getImm());
631 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
634 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
637 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
639 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::op_sel);
648 MO->setIsKill(
false);
663void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
665 if (!
ST->hasGFX10_3Insts())
668 if (
MI.getOpcode() != LMC.AndSaveExecOpc)
671 Register SaveExecDest =
MI.getOperand(0).getReg();
672 if (!
TRI->isSGPRReg(*
MRI, SaveExecDest))
676 if (!SaveExecSrc0->
isReg())
688 VCmp = findInstrBackwards(
694 {LMC.ExecReg, SaveExecSrc0->
getReg()});
700 assert(VCmpDest &&
"Should have an sdst operand!");
722 if (isRegisterInUseBetween(*VCmp,
MI, VCmpDest->
getReg(),
false,
true) ||
723 isRegisterInUseAfter(
MI, VCmpDest->
getReg()))
737 if (!findInstrBackwards(
739 VCmp, &KillFlagCandidates))
743 SaveExecVCmpMapping[&
MI] = VCmp;
751void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(
MachineInstr &
MI) {
752 if (
MI.getOpcode() == LMC.XorOpc && &
MI != &
MI.getParent()->front()) {
759 (XorSrc0.
getReg() == LMC.ExecReg || XorSrc1.
getReg() == LMC.ExecReg)) {
764 if (PossibleOrSaveexec.
getOpcode() != LMC.OrSaveExecOpc)
770 if ((XorSrc0.
getReg() == LMC.ExecReg &&
773 XorSrc1.
getReg() == LMC.ExecReg)) {
774 OrXors.emplace_back(&PossibleOrSaveexec, &
MI);
781bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
782 if (OrXors.empty()) {
788 for (
const auto &Pair : OrXors) {
791 std::tie(
Or,
Xor) = Pair;
792 BuildMI(*
Or->getParent(),
Or->getIterator(),
Or->getDebugLoc(),
793 TII->get(LMC.AndN2SaveExecOpc),
Or->getOperand(0).getReg())
794 .
addReg(
Or->getOperand(1).getReg());
796 Or->eraseFromParent();
797 Xor->eraseFromParent();
805bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(
MachineFunction &MF) {
809 return SIOptimizeExecMasking(&MF).run();
812bool SIOptimizeExecMasking::run() {
813 bool Changed = optimizeExecSequence();
816 SaveExecVCmpMapping.clear();
817 KillFlagCandidates.clear();
818 static unsigned SearchWindow = 10;
820 unsigned SearchCount = 0;
823 if (
MI.isDebugInstr())
826 if (SearchCount >= SearchWindow) {
830 tryRecordOrSaveexecXorSequence(
MI);
831 tryRecordVCmpxAndSaveexecSequence(
MI);
833 if (
MI.modifiesRegister(LMC.ExecReg,
TRI)) {
841 Changed |= optimizeOrSaveexecXorSequences();
842 for (
const auto &Entry : SaveExecVCmpMapping) {
846 Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr);
unsigned const MachineRegisterInfo * MRI
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
Register const TargetRegisterInfo * TRI
Promote Memory to Register
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
static unsigned getSaveExecOp(unsigned Opc)
static Register isLogicalOpOnExec(const MachineInstr &MI)
If MI is a logical operation on an exec value, return the register copied to.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg)
Interface definition for SIRegisterInfo.
This file defines the SmallVector class.
static const LaneMaskConstants & get(const GCNSubtarget &ST)
Represent the analysis usage information of a pass.
LLVM_ABI void setPreservesCFG()
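This function should be called by the pass if and only if it does not add or remove basic blocks from the function and does not modify terminator instructions in any way.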
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Represents analyses that only rely on functions' control flow.
A set of register units used to track register liveness.
Wrapper class representing physical registers. Should be passed by value.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
reverse_iterator rbegin()
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of passes that operate on the MachineFunction representation.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool isCommutable(QueryType Type=IgnoreBundle) const
Return true if this may be a 2- or 3-address instruction (of the form "X = op Y, Z, ..."), which produces the same result if Y and Z are exchanged.
mop_range uses()
Returns all operands which may be register uses.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
LLVM_ABI void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
reverse_self_iterator getReverseIterator()
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY int32_t getVCMPXOpFromVCMP(uint32_t Opcode)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
std::reverse_iterator< iterator > rend() const
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
char & SIOptimizeExecMaskingLegacyID
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
auto reverse(ContainerTy &&C)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
@ Mod
The access may modify the value stored in memory.
@ Xor
Bitwise or logical XOR of integers.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.