#define DEBUG_TYPE "si-optimize-exec-masking"
class SIOptimizeExecMasking {
  // ... cached target info, the exec register for the wave size, and the
  //     recorded candidate sequences are elided in this excerpt ...
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  MachineInstr *findInstrBackwards(/* ... */,
                                   unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;
  bool optimizeOrSaveexecXorSequences();

public:
  bool run(MachineFunction &MF);
};

class SIOptimizeExecMaskingLegacy : public MachineFunctionPass {
public:
  static char ID;
  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }
  // ...
};

PreservedAnalyses
SIOptimizeExecMaskingPass::run(MachineFunction &MF,
                               MachineFunctionAnalysisManager &MFAM) {
  SIOptimizeExecMasking Impl;
  // ...
}
105 "SI optimize exec mask operations",
false,
false)
110char SIOptimizeExecMaskingLegacy::
ID = 0;
// If MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
// If MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}
/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}
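
// getSaveExecOp feeds optimizeExecSequence below: once a logical op between a
// copy from exec and a copy back to exec has been found, its opcode is mapped
// to the fused saveexec form (e.g. S_AND_B64 -> S_AND_SAVEEXEC_B64). Opcodes
// with no saveexec counterpart map to INSTRUCTION_LIST_END, which callers use
// as the "give up" value.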
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    // The *_term forms are terminators only to get the correct spill code
    // placement during register allocation; lower them to the plain ops.
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}
// Turn all pseudo-terminators in the block into their equivalent
// non-terminator instructions; return an iterator to the first
// non-terminator.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    // Give up if any register that must stay intact is clobbered; also record
    // kill flags that will need clearing if the sequence is rewritten.
    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}
// Determine whether Reg is not redefined and still in use in the range
// (Stop..Start], by computing liveness backwards over the block.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LiveRegUnits LR(*TRI);
  // ... seed LR with the block's live-outs if UseLiveOuts, then step it
  //     backwards from Start (skipping Start itself if IgnoreStart) to Stop ...
  return !LR.available(Reg) || MRI->isReserved(Reg);
}
// Determine whether Reg is not redefined and still in use in the range
// (Stop..end of block].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) const {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
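
// A rough sketch of the control-flow-lowering sequence optimizeExecSequence
// rewrites (wave64 shown; wave32 uses the 32-bit opcodes, and the register
// names are illustrative only):
//
//   %sgpr = COPY $exec
//   %sgpr = S_<op>_B64 %sgpr, %mask
//   $exec = COPY %sgpr
// =>
//   %sgpr = S_<op>_SAVEEXEC_B64 %mask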
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // Look for a copy to exec among the first few instructions from the end.
    Register CopyToExec;
    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec.isValid())
        break;
      ++I;
    }

    if (!CopyToExec.isValid())
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec.
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        PrepareExecInst->getOperand(0).setReg(Exec);
        CopyToExecInst->eraseFromParent();
        Changed = true;
      }
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          // Multiple instructions modify the copy destination; give up.
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        }

        LLVM_DEBUG(dbgs() << "Instruction does not read exec copy: " << *J
                          << '\n');
        break;
      }

      if (ReadsCopyFromExec && !SaveExecInst) {
        // The copy of exec has another use before the candidate saveexec op.
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);
    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      // ... only legal if the logical op is commutable ...
      OtherOp = &Src0;
    }

    CopyFromExecInst->eraseFromParent();
    // ... emit the fused s_<op>_saveexec with OtherOp as its source, then
    //     remove the original logical op ...
    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts)
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);

    Changed = true;
  }

  return Changed;
}
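
// A rough sketch of the GFX10.3+ sequence handled by the two routines below
// (wave32 shown; the register names are illustrative only):
//
//   %vcc_lo = V_CMP_*  %a, %b
//   %save   = S_AND_SAVEEXEC_B32 %vcc_lo
// =>
//   %save   = S_MOV_B32 $exec_lo        ; only emitted if %save is still used
//   V_CMPX_* %a, %b                     ; writes $exec_lo implicitly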
// Insert the optimized s_mov / v_cmpx sequence for a recorded
// v_cmp ...; s_and_saveexec pair.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  // ... look up the V_CMPX opcode, the v_cmp source operands and the saveexec
  //     destination (MoveDest); bail out if there is no V_CMPX form ...

  // Keep the saved exec value if it is still used.
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    // ... BuildMI(..., TII->get(MovOpcode), MoveDest).addReg(Exec) ...
  }

  // Build the V_CMPX at the position of the saveexec. The explicit destination
  // is omitted since V_CMPX implicitly writes EXEC; source modifiers and clamp
  // are copied over from the original V_CMP.
  // auto Builder = BuildMI(..., TII->get(NewVCmpxOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  // ... add src0 ...
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  // ... add src1 ...
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags recorded earlier may no longer be correct.
  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  // ... erase the original v_cmp and s_and_saveexec ...
  return true;
}
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Find the v_cmp that defines the saveexec source, with no intervening
  // write to exec or to that source.
  MachineInstr *VCmp = nullptr;
  VCmp = findInstrBackwards(
      /* ... predicate matching a v_cmp with a V_CMPX form that defines the
             saveexec source, plus the registers that must stay intact ... */);
  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // ... further legality checks on the v_cmp operands and liveness ...

  // Skip if the v_cmp result is still needed between the two instructions or
  // after the saveexec.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Re-scan to collect kill flags that must be cleared when the sequence is
  // rewritten.
  if (!findInstrBackwards(
          /* ... */, VCmp, &KillFlagCandidates))
    return;

  SaveExecVCmpMapping[&MI] = VCmp;
}
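
// A rough sketch of the pair recorded by tryRecordOrSaveexecXorSequence and
// folded by optimizeOrSaveexecXorSequences (wave32 shown; register names are
// illustrative only):
//
//   %save    = S_OR_SAVEEXEC_B32 %mask
//   $exec_lo = S_XOR_B32 $exec_lo, %save
// =>
//   %save    = S_ANDN2_SAVEEXEC_B32 %mask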
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check whether it is the
      // matching S_OR_SAVEEXEC.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      // ... verify the or_saveexec operands are registers ...
      OrXors.emplace_back(&PossibleOrSaveexec, &MI);
    }
  }
}
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}
bool SIOptimizeExecMaskingLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  return SIOptimizeExecMasking().run(MF);
}

bool SIOptimizeExecMasking::run(MachineFunction &MF) {
  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  // ... Exec is set to EXEC or EXEC_LO depending on the wave size ...

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    // Scan each block bottom-up, but only within a small search window and
    // never past an instruction that writes exec.
    for (MachineInstr &MI : reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow)
        break;

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI))
        break;

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();

    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}