22#define DEBUG_TYPE "si-optimize-exec-masking"
48 bool IgnoreStart =
false)
const;
55 unsigned MaxInstructions = 20)
const;
56 bool optimizeExecSequence();
58 bool optimizeVCMPSaveExecSequence(
MachineInstr &SaveExecInstr,
62 bool optimizeOrSaveexecXorSequences();
74 return "SI optimize exec mask operations";
86 "SI optimize exec mask operations",
false,
false)
91char SIOptimizeExecMasking::
ID = 0;
97 switch (
MI.getOpcode()) {
99 case AMDGPU::S_MOV_B64:
100 case AMDGPU::S_MOV_B64_term:
101 case AMDGPU::S_MOV_B32:
102 case AMDGPU::S_MOV_B32_term: {
104 if (Src.isReg() && Src.getReg() == Exec)
105 return MI.getOperand(0).getReg();
109 return AMDGPU::NoRegister;
114 switch (
MI.getOpcode()) {
116 case AMDGPU::S_MOV_B64:
117 case AMDGPU::S_MOV_B32: {
119 if (Dst.isReg() && Dst.getReg() == Exec &&
MI.getOperand(1).isReg())
120 return MI.getOperand(1).getReg();
123 case AMDGPU::S_MOV_B64_term:
124 case AMDGPU::S_MOV_B32_term:
134 switch (
MI.getOpcode()) {
135 case AMDGPU::S_AND_B64:
136 case AMDGPU::S_OR_B64:
137 case AMDGPU::S_XOR_B64:
138 case AMDGPU::S_ANDN2_B64:
139 case AMDGPU::S_ORN2_B64:
140 case AMDGPU::S_NAND_B64:
141 case AMDGPU::S_NOR_B64:
142 case AMDGPU::S_XNOR_B64: {
145 return MI.getOperand(0).getReg();
148 return MI.getOperand(0).getReg();
151 case AMDGPU::S_AND_B32:
152 case AMDGPU::S_OR_B32:
153 case AMDGPU::S_XOR_B32:
154 case AMDGPU::S_ANDN2_B32:
155 case AMDGPU::S_ORN2_B32:
156 case AMDGPU::S_NAND_B32:
157 case AMDGPU::S_NOR_B32:
158 case AMDGPU::S_XNOR_B32: {
160 if (Src1.
isReg() && Src1.
getReg() == AMDGPU::EXEC_LO)
161 return MI.getOperand(0).getReg();
163 if (Src2.
isReg() && Src2.
getReg() == AMDGPU::EXEC_LO)
164 return MI.getOperand(0).getReg();
169 return AMDGPU::NoRegister;
174 case AMDGPU::S_AND_B64:
175 return AMDGPU::S_AND_SAVEEXEC_B64;
176 case AMDGPU::S_OR_B64:
177 return AMDGPU::S_OR_SAVEEXEC_B64;
178 case AMDGPU::S_XOR_B64:
179 return AMDGPU::S_XOR_SAVEEXEC_B64;
180 case AMDGPU::S_ANDN2_B64:
181 return AMDGPU::S_ANDN2_SAVEEXEC_B64;
182 case AMDGPU::S_ORN2_B64:
183 return AMDGPU::S_ORN2_SAVEEXEC_B64;
184 case AMDGPU::S_NAND_B64:
185 return AMDGPU::S_NAND_SAVEEXEC_B64;
186 case AMDGPU::S_NOR_B64:
187 return AMDGPU::S_NOR_SAVEEXEC_B64;
188 case AMDGPU::S_XNOR_B64:
189 return AMDGPU::S_XNOR_SAVEEXEC_B64;
190 case AMDGPU::S_AND_B32:
191 return AMDGPU::S_AND_SAVEEXEC_B32;
192 case AMDGPU::S_OR_B32:
193 return AMDGPU::S_OR_SAVEEXEC_B32;
194 case AMDGPU::S_XOR_B32:
195 return AMDGPU::S_XOR_SAVEEXEC_B32;
196 case AMDGPU::S_ANDN2_B32:
197 return AMDGPU::S_ANDN2_SAVEEXEC_B32;
198 case AMDGPU::S_ORN2_B32:
199 return AMDGPU::S_ORN2_SAVEEXEC_B32;
200 case AMDGPU::S_NAND_B32:
201 return AMDGPU::S_NAND_SAVEEXEC_B32;
202 case AMDGPU::S_NOR_B32:
203 return AMDGPU::S_NOR_SAVEEXEC_B32;
204 case AMDGPU::S_XNOR_B32:
205 return AMDGPU::S_XNOR_SAVEEXEC_B32;
207 return AMDGPU::INSTRUCTION_LIST_END;
213bool SIOptimizeExecMasking::removeTerminatorBit(
MachineInstr &
MI)
const {
214 switch (
MI.getOpcode()) {
215 case AMDGPU::S_MOV_B32_term: {
216 bool RegSrc =
MI.getOperand(1).isReg();
217 MI.setDesc(
TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
220 case AMDGPU::S_MOV_B64_term: {
221 bool RegSrc =
MI.getOperand(1).isReg();
222 MI.setDesc(
TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
225 case AMDGPU::S_XOR_B64_term: {
228 MI.setDesc(
TII->get(AMDGPU::S_XOR_B64));
231 case AMDGPU::S_XOR_B32_term: {
234 MI.setDesc(
TII->get(AMDGPU::S_XOR_B32));
237 case AMDGPU::S_OR_B64_term: {
240 MI.setDesc(
TII->get(AMDGPU::S_OR_B64));
243 case AMDGPU::S_OR_B32_term: {
246 MI.setDesc(
TII->get(AMDGPU::S_OR_B32));
249 case AMDGPU::S_ANDN2_B64_term: {
252 MI.setDesc(
TII->get(AMDGPU::S_ANDN2_B64));
255 case AMDGPU::S_ANDN2_B32_term: {
258 MI.setDesc(
TII->get(AMDGPU::S_ANDN2_B32));
261 case AMDGPU::S_AND_B64_term: {
264 MI.setDesc(
TII->get(AMDGPU::S_AND_B64));
267 case AMDGPU::S_AND_B32_term: {
270 MI.setDesc(
TII->get(AMDGPU::S_AND_B32));
287 for (;
I != E; ++
I) {
288 if (!
I->isTerminator())
289 return Seen ? FirstNonTerm :
I;
291 if (removeTerminatorBit(*
I)) {
304 const unsigned InstLimit = 25;
307 for (
unsigned N = 0;
N <= InstLimit &&
I != E; ++
I, ++
N) {
308 Register CopyFromExec = isCopyFromExec(*
I);
321 if (Succ->isLiveIn(Reg))
337 unsigned MaxInstructions)
const {
340 unsigned CurrentIteration = 0;
342 for (++
A; CurrentIteration < MaxInstructions &&
A != E; ++
A) {
343 if (
A->isDebugInstr())
350 if (
A->modifiesRegister(Reg,
TRI))
357 if (Terminator && KillFlagCandidates &&
A != Terminator &&
358 A->killsRegister(Reg,
TRI)) {
360 if (MO.isReg() && MO.isKill()) {
362 if (Candidate != Reg &&
TRI->regsOverlap(Candidate, Reg))
381bool SIOptimizeExecMasking::isRegisterInUseBetween(
MachineInstr &Stop,
385 bool IgnoreStart)
const {
399 return !LR.available(Reg) ||
MRI->isReserved(Reg);
404bool SIOptimizeExecMasking::isRegisterInUseAfter(
MachineInstr &Stop,
406 return isRegisterInUseBetween(Stop, *Stop.
getParent()->
rbegin(), Reg,
true);
419bool SIOptimizeExecMasking::optimizeExecSequence() {
420 bool Changed =
false;
431 unsigned SearchCount = 0;
432 const unsigned SearchLimit = 5;
433 while (
I != E && SearchCount++ < SearchLimit) {
434 CopyToExec = isCopyToExec(*
I);
444 auto *CopyToExecInst = &*
I;
445 auto CopyFromExecInst = findExecCopy(
MBB,
I);
446 if (CopyFromExecInst == E) {
447 auto PrepareExecInst = std::next(
I);
448 if (PrepareExecInst == E)
451 if (CopyToExecInst->getOperand(1).isKill() &&
455 PrepareExecInst->getOperand(0).setReg(Exec);
459 CopyToExecInst->eraseFromParent();
472 Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
477 J = std::next(CopyFromExecInst->getIterator()),
478 JE =
I->getIterator();
480 if (SaveExecInst && J->readsRegister(Exec,
TRI)) {
481 LLVM_DEBUG(
dbgs() <<
"exec read prevents saveexec: " << *J <<
'\n');
484 SaveExecInst =
nullptr;
490 if (J->modifiesRegister(CopyToExec,
TRI)) {
494 SaveExecInst =
nullptr;
499 if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
502 if (ReadsCopyFromExec) {
504 LLVM_DEBUG(
dbgs() <<
"Found save exec op: " << *SaveExecInst <<
'\n');
507 LLVM_DEBUG(
dbgs() <<
"Instruction does not read exec copy: " << *J
511 if (ReadsCopyFromExec && !SaveExecInst) {
520 LLVM_DEBUG(
dbgs() <<
"Found second use of save inst candidate: " << *J
525 if (SaveExecInst && J->readsRegister(CopyToExec,
TRI)) {
526 assert(SaveExecInst != &*J);
534 LLVM_DEBUG(
dbgs() <<
"Insert save exec op: " << *SaveExecInst <<
'\n');
541 if (Src0.
isReg() && Src0.
getReg() == CopyFromExec) {
543 }
else if (Src1.
isReg() && Src1.
getReg() == CopyFromExec) {
551 CopyFromExecInst->eraseFromParent();
561 CopyToExecInst->eraseFromParent();
564 OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
576bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
589 if (!SaveExecInstr.
uses().empty()) {
590 bool IsSGPR32 =
TRI->getRegSizeInBits(MoveDest, *
MRI) == 32;
591 unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
602 auto TryAddImmediateValueFromNamedOperand =
603 [&](
unsigned OperandName) ->
void {
604 if (
auto *
Mod =
TII->getNamedOperand(VCmp, OperandName))
605 Builder.addImm(
Mod->getImm());
608 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
611 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
614 TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);
623 MO->setIsKill(
false);
638void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
640 if (!
ST->hasGFX10_3Insts())
643 const unsigned AndSaveExecOpcode =
644 ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
646 if (
MI.getOpcode() != AndSaveExecOpcode)
649 Register SaveExecDest =
MI.getOperand(0).getReg();
650 if (!
TRI->isSGPRReg(*
MRI, SaveExecDest))
654 if (!SaveExecSrc0->
isReg())
666 VCmp = findInstrBackwards(
678 assert(VCmpDest &&
"Should have an sdst operand!");
700 if (isRegisterInUseBetween(*VCmp,
MI, VCmpDest->
getReg(),
false,
true) ||
701 isRegisterInUseAfter(
MI, VCmpDest->
getReg()))
715 if (!findInstrBackwards(
717 VCmp, &KillFlagCandidates))
721 SaveExecVCmpMapping[&
MI] = VCmp;
729void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(
MachineInstr &
MI) {
730 const unsigned XorOpcode =
731 ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;
733 if (
MI.getOpcode() == XorOpcode && &
MI != &
MI.getParent()->front()) {
740 (XorSrc0.
getReg() == Exec || XorSrc1.
getReg() == Exec)) {
741 const unsigned OrSaveexecOpcode =
ST->isWave32()
742 ? AMDGPU::S_OR_SAVEEXEC_B32
743 : AMDGPU::S_OR_SAVEEXEC_B64;
748 if (PossibleOrSaveexec.
getOpcode() != OrSaveexecOpcode)
756 OrXors.emplace_back(&PossibleOrSaveexec, &
MI);
763bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
764 if (OrXors.empty()) {
768 bool Changed =
false;
769 const unsigned Andn2Opcode =
ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
770 : AMDGPU::S_ANDN2_SAVEEXEC_B64;
772 for (
const auto &Pair : OrXors) {
775 std::tie(
Or,
Xor) = Pair;
776 BuildMI(*
Or->getParent(),
Or->getIterator(),
Or->getDebugLoc(),
777 TII->get(Andn2Opcode),
Or->getOperand(0).getReg())
778 .
addReg(
Or->getOperand(1).getReg());
780 Or->eraseFromParent();
781 Xor->eraseFromParent();
795 TRI =
ST->getRegisterInfo();
796 TII =
ST->getInstrInfo();
800 bool Changed = optimizeExecSequence();
803 SaveExecVCmpMapping.clear();
804 KillFlagCandidates.clear();
805 static unsigned SearchWindow = 10;
807 unsigned SearchCount = 0;
810 if (
MI.isDebugInstr())
813 if (SearchCount >= SearchWindow) {
817 tryRecordOrSaveexecXorSequence(
MI);
818 tryRecordVCmpxAndSaveexecSequence(
MI);
820 if (
MI.modifiesRegister(Exec,
TRI)) {
828 Changed |= optimizeOrSaveexecXorSequences();
829 for (
const auto &Entry : SaveExecVCmpMapping) {
833 Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getSaveExecOp(unsigned Opc)
SI optimize exec mask operations
static Register isLogicalOpOnExec(const MachineInstr &MI)
If MI is a logical operation on an exec value, return the register copied to.
static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg)
Interface definition for SIRegisterInfo.
This file defines the SmallVector class.
Represent the analysis usage information of a pass.
void setPreservesCFG()
This function should be called by the pass, iff they do not:
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
A set of register units used to track register liveness.
Wrapper class representing physical registers. Should be passed by value.
Instructions::iterator instr_iterator
iterator_range< succ_iterator > successors()
reverse_iterator rbegin()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
iterator_range< mop_iterator > uses()
Returns a range that includes all operands that are register uses.
bool isCommutable(QueryType Type=IgnoreBundle) const
Return true if this may be a 2- or 3-address instruction (of the form "X = op Y, Z,...
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
reverse_self_iterator getReverseIterator()
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_READONLY int getVCMPXOpFromVCMP(uint16_t Opcode)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
reverse_iterator rend(StringRef path)
Get reverse end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
auto reverse(ContainerTy &&C)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
@ Or
Bitwise or logical OR of integers.
@ Xor
Bitwise or logical XOR of integers.
void initializeSIOptimizeExecMaskingPass(PassRegistry &)
Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
char & SIOptimizeExecMaskingID