#define DEBUG_TYPE "si-optimize-exec-masking"
class SIOptimizeExecMasking : public MachineFunctionPass {
  // ...
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  MachineInstr *findInstrBackwards(MachineInstr &Origin,
                                   std::function<bool(MachineInstr *)> Pred,
                                   ArrayRef<MCRegister> NonModifiableRegs,
                                   unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec);
  bool optimizeOrSaveexecXorSequences();
  // ...
  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }
  // ...
};
83 "SI optimize exec mask operations",
false,
false)
88char SIOptimizeExecMasking::
ID = 0;
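// In outline (an illustrative summary, not a comment from the upstream file),
// the pass performs three rewrites on exec-mask manipulation sequences:
//  1. optimizeExecSequence: fold "x = COPY exec; z = s_<op> x, y;
//     exec = COPY z" into a single "x = s_<op>_saveexec y".
//  2. optimizeOrSaveexecXorSequences: fold "s_or_saveexec s_o, s_i;
//     s_xor exec, exec, s_o" into "s_andn2_saveexec s_o, s_i".
//  3. optimizeVCMPSaveExecSequence: on GFX10.3+, replace a
//     "v_cmp ...; s_and_saveexec ..." pair with an implicitly
//     exec-writing v_cmpx (plus an s_mov if the saved mask is used).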
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return Register();
}
// If MI is a logical operation on an exec value,
// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}
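// For example (illustrative, hypothetical registers): a candidate
//   %x = S_AND_B64 %y, $exec
// maps through getSaveExecOp(S_AND_B64) to S_AND_SAVEEXEC_B64, which both
// saves the old exec mask into %x and ANDs %y into exec in one instruction.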
// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}
// Turn all pseudoterminators in the block back into their equivalent
// non-terminator instructions; return the first non-terminator.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;
    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }
  return FirstNonTerm;
}
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }
  return false;
}
// Walk backwards from Origin for at most MaxInstructions, returning the
// first instruction matching Pred; bail out (nullptr) if any register in
// NonModifiableRegs is modified along the way.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;
    if (Pred(&*A))
      return &*A;
    for (MCRegister Reg : NonModifiableRegs)
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;
    ++CurrentIteration;
  }

  return nullptr;
}
// Determine if a register Reg is still in use and not redefined in the
// range (Stop..Start], by calculating liveness backwards.
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LivePhysRegs LR(*TRI);
  // ... step liveness backward from Start (exclusive if IgnoreStart) to Stop.
  return !LR.available(*MRI, Reg);
}
// Determine if a register Reg is still in use in the range (Stop..BB.end].
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
// Optimize sequences emitted for control flow lowering. They are originally
// emitted as separate operations because spill code may need to be inserted
// for the saved copy of exec:
//
//     x = copy exec
//     z = s_<op>_b64 x, y
//     exec = copy z
// =>
//     x = s_<op>_saveexec_b64 y
//
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        PrepareExecInst->getOperand(0).setReg(Exec);
        CopyToExecInst->eraseFromParent();
        Changed = true;
      }
      continue;
    }

    // ... give up if CopyToExec is live out of the block (see isLiveOut) ...

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          // Multiple instructions modify the copy destination; give up.
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);
    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;
      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();
    // ... build the s_<op>_saveexec that replaces SaveExecInst ...
    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}
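// Concrete shape of the rewrite above (illustrative wave64 MIR; register
// assignments are hypothetical):
//
//   $sgpr0_sgpr1 = COPY $exec
//   $sgpr2_sgpr3 = S_AND_B64 killed $sgpr0_sgpr1, killed $vcc
//   $exec = COPY killed $sgpr2_sgpr3
// =>
//   $sgpr0_sgpr1 = S_AND_SAVEEXEC_B64 killed $vcc
//
// Any remaining readers of $sgpr2_sgpr3 are rewritten to read $exec directly.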
// Inserts the optimized s_mov / v_cmpx sequence based on the operands
// extracted from a v_cmp ..., s_and_saveexec pattern.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
  if (NewOpcode == -1)
    return false;

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();

  // If the saved exec mask is actually used, keep it alive with an s_mov.
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit dst as V_CMPX is implicitly writing to EXEC.
  // Add dummy src and clamp modifiers, if needed.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*TII->getNamedOperand(VCmp, AMDGPU::OpName::src0));
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*TII->getNamedOperand(VCmp, AMDGPU::OpName::src1));
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
  // ... clear stale kill flags, then erase the original v_cmp and
  // s_and_saveexec ...
  return true;
}
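// Illustrative effect (GFX10.3+, wave32; registers hypothetical):
//   v_cmp_lt_f32_e64 s0, v0, v1
//   s_and_saveexec_b32 s1, s0
// becomes
//   s_mov_b32 s1, exec_lo      ; emitted only if s1 is actually used
//   v_cmpx_lt_f32_e64 v0, v1   ; writes exec_lo implicitly
// The AND with the old exec mask is implicit: only currently active lanes
// execute the v_cmpx, so only they can set their bit in the new mask.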
// Record (on GFX10.3 and later) occurrences of
//   v_cmp_* SGPR, IMM, VGPR
//   s_and_saveexec_b32 EXEC_SGPR_DEST, SGPR
// to be replaced with
//   s_mov_b32 EXEC_SGPR_DEST, exec_lo
//   v_cmpx_* IMM, VGPR
// to reduce pipeline stalls.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Try to find the last v_cmp that defines the saveexec input operand,
  // with no write to exec or to that operand in between.
  MachineInstr *VCmp = nullptr;
  VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});
  if (!VCmp)
    return;

  MachineOperand *VCmpDest =
      TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // If the v_cmp result is used between the v_cmp and the s_and_saveexec,
  // or after the s_and_saveexec, removing the v_cmp def would be unsafe.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // ... collect exec and the v_cmp source registers into NonDefRegs, and
  // require them unmodified on the path back to the v_cmp ...
  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
    return;

  SaveExecVCmpMapping[&MI] = VCmp;
}
// Record occurrences of
//   s_or_saveexec s_o, s_i
//   s_xor exec, exec, s_o
// to be replaced with
//   s_andn2_saveexec s_o, s_i
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg())
        OrXors.emplace_back(&PossibleOrSaveexec, &MI);
    }
  }
}
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty()) {
    return false;
  }

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}
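// Why the fold is sound (informal sketch): s_or_saveexec computes
//   s_o = exec; exec = exec | s_i
// and the following s_xor computes
//   exec = exec ^ s_o = (old_exec | s_i) ^ old_exec = s_i & ~old_exec,
// which is exactly what s_andn2_saveexec s_o, s_i produces in one
// instruction (s_o = exec; exec = s_i & ~exec).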
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;
    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;
      if (SearchCount >= SearchWindow)
        break;

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI))
        break;

      SearchCount++;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();
    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}