#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "si-optimize-exec-masking"
class SIOptimizeExecMasking : public MachineFunctionPass {
  // ... (subtarget/instruction/register info members and other helpers elided)
  bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
                              MCRegister Reg, bool UseLiveOuts = false,
                              bool IgnoreStart = false) const;
  MachineInstr *findInstrBackwards(
      MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
      ArrayRef<MCRegister> NonModifiableRegs,
      MachineInstr *Terminator = nullptr,
      SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
      unsigned MaxInstructions = 20) const;
  bool optimizeExecSequence();
  bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
                                    MachineInstr &VCmp, MCRegister Exec) const;
  bool optimizeOrSaveexecXorSequences();

public:
  static char ID;
  StringRef getPassName() const override {
    return "SI optimize exec mask operations";
  }
};
86 "SI optimize exec mask operations",
false,
false)
91char SIOptimizeExecMasking::
ID = 0;
/// If \p MI is a copy from exec, return the register copied to.
Register SIOptimizeExecMasking::isCopyFromExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32:
  case AMDGPU::S_MOV_B32_term: {
    const MachineOperand &Src = MI.getOperand(1);
    if (Src.isReg() && Src.getReg() == Exec)
      return MI.getOperand(0).getReg();
  }
  }

  return AMDGPU::NoRegister;
}
/// If \p MI is a copy to exec, return the register copied from.
Register SIOptimizeExecMasking::isCopyToExec(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::COPY:
  case AMDGPU::S_MOV_B64:
  case AMDGPU::S_MOV_B32: {
    const MachineOperand &Dst = MI.getOperand(0);
    if (Dst.isReg() && Dst.getReg() == Exec && MI.getOperand(1).isReg())
      return MI.getOperand(1).getReg();
    break;
  }
  case AMDGPU::S_MOV_B64_term:
  case AMDGPU::S_MOV_B32_term:
    llvm_unreachable("should have been replaced");
  }

  return AMDGPU::NoRegister;
}
/// If \p MI is a logical operation on an exec value,
/// return the register copied to.
static Register isLogicalOpOnExec(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ANDN2_B64:
  case AMDGPU::S_ORN2_B64:
  case AMDGPU::S_NAND_B64:
  case AMDGPU::S_NOR_B64:
  case AMDGPU::S_XNOR_B64: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC)
      return MI.getOperand(0).getReg();
    break;
  }
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_ANDN2_B32:
  case AMDGPU::S_ORN2_B32:
  case AMDGPU::S_NAND_B32:
  case AMDGPU::S_NOR_B32:
  case AMDGPU::S_XNOR_B32: {
    const MachineOperand &Src1 = MI.getOperand(1);
    if (Src1.isReg() && Src1.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    const MachineOperand &Src2 = MI.getOperand(2);
    if (Src2.isReg() && Src2.getReg() == AMDGPU::EXEC_LO)
      return MI.getOperand(0).getReg();
    break;
  }
  }

  return AMDGPU::NoRegister;
}
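// Illustrative example (made-up registers): for
//   $sgpr0_sgpr1 = S_AND_B64 $exec, $sgpr2_sgpr3
// isLogicalOpOnExec returns $sgpr0_sgpr1, the register receiving the combined
// mask; any other shape yields AMDGPU::NoRegister.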
static unsigned getSaveExecOp(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::S_AND_B64:
    return AMDGPU::S_AND_SAVEEXEC_B64;
  case AMDGPU::S_OR_B64:
    return AMDGPU::S_OR_SAVEEXEC_B64;
  case AMDGPU::S_XOR_B64:
    return AMDGPU::S_XOR_SAVEEXEC_B64;
  case AMDGPU::S_ANDN2_B64:
    return AMDGPU::S_ANDN2_SAVEEXEC_B64;
  case AMDGPU::S_ORN2_B64:
    return AMDGPU::S_ORN2_SAVEEXEC_B64;
  case AMDGPU::S_NAND_B64:
    return AMDGPU::S_NAND_SAVEEXEC_B64;
  case AMDGPU::S_NOR_B64:
    return AMDGPU::S_NOR_SAVEEXEC_B64;
  case AMDGPU::S_XNOR_B64:
    return AMDGPU::S_XNOR_SAVEEXEC_B64;
  case AMDGPU::S_AND_B32:
    return AMDGPU::S_AND_SAVEEXEC_B32;
  case AMDGPU::S_OR_B32:
    return AMDGPU::S_OR_SAVEEXEC_B32;
  case AMDGPU::S_XOR_B32:
    return AMDGPU::S_XOR_SAVEEXEC_B32;
  case AMDGPU::S_ANDN2_B32:
    return AMDGPU::S_ANDN2_SAVEEXEC_B32;
  case AMDGPU::S_ORN2_B32:
    return AMDGPU::S_ORN2_SAVEEXEC_B32;
  case AMDGPU::S_NAND_B32:
    return AMDGPU::S_NAND_SAVEEXEC_B32;
  case AMDGPU::S_NOR_B32:
    return AMDGPU::S_NOR_SAVEEXEC_B32;
  case AMDGPU::S_XNOR_B32:
    return AMDGPU::S_XNOR_SAVEEXEC_B32;
  default:
    return AMDGPU::INSTRUCTION_LIST_END;
  }
}
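// Illustrative use of the mapping (made-up registers): getSaveExecOp turns
// the plain logical opcode of, e.g.,
//   $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, $sgpr4_sgpr5
// into S_AND_SAVEEXEC_B64, the form that saves the old exec into its
// destination and writes the logical result to exec; unknown opcodes map to
// INSTRUCTION_LIST_END so callers can bail out.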
// These are only terminators to get correct spill code placement during
// register allocation, so turn them back into normal instructions.
bool SIOptimizeExecMasking::removeTerminatorBit(MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case AMDGPU::S_MOV_B32_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B32));
    return true;
  }
  case AMDGPU::S_MOV_B64_term: {
    bool RegSrc = MI.getOperand(1).isReg();
    MI.setDesc(TII->get(RegSrc ? AMDGPU::COPY : AMDGPU::S_MOV_B64));
    return true;
  }
  case AMDGPU::S_XOR_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_XOR_B64));
    return true;
  }
  case AMDGPU::S_XOR_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_XOR_B32));
    return true;
  }
  case AMDGPU::S_OR_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_OR_B64));
    return true;
  }
  case AMDGPU::S_OR_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_OR_B32));
    return true;
  }
  case AMDGPU::S_ANDN2_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B64));
    return true;
  }
  case AMDGPU::S_ANDN2_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_ANDN2_B32));
    return true;
  }
  case AMDGPU::S_AND_B64_term: {
    MI.setDesc(TII->get(AMDGPU::S_AND_B64));
    return true;
  }
  case AMDGPU::S_AND_B32_term: {
    MI.setDesc(TII->get(AMDGPU::S_AND_B32));
    return true;
  }
  default:
    return false;
  }
}
// Turn terminator pseudos back into regular instructions and return the first
// non-terminator in the block.
MachineBasicBlock::reverse_iterator
SIOptimizeExecMasking::fixTerminators(MachineBasicBlock &MBB) const {
  MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();

  bool Seen = false;
  MachineBasicBlock::reverse_iterator FirstNonTerm = I;
  for (; I != E; ++I) {
    if (!I->isTerminator())
      return Seen ? FirstNonTerm : I;

    if (removeTerminatorBit(*I)) {
      if (!Seen) {
        FirstNonTerm = I;
        Seen = true;
      }
    }
  }

  return FirstNonTerm;
}
MachineBasicBlock::reverse_iterator SIOptimizeExecMasking::findExecCopy(
    MachineBasicBlock &MBB, MachineBasicBlock::reverse_iterator I) const {
  const unsigned InstLimit = 25;

  auto E = MBB.rend();
  for (unsigned N = 0; N <= InstLimit && I != E; ++I, ++N) {
    Register CopyFromExec = isCopyFromExec(*I);
    if (CopyFromExec.isValid())
      return I;
  }

  return E;
}

static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
  for (const MachineBasicBlock *Succ : MBB.successors()) {
    if (Succ->isLiveIn(Reg))
      return true;
  }

  return false;
}
// Walk backwards from Origin for at most MaxInstructions instructions and
// return the first instruction matching Pred; give up (return nullptr) if a
// register in NonModifiableRegs is modified on the way.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
    MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
    ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
    SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
    unsigned MaxInstructions) const {
  MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
                                      E = Origin.getParent()->rend();
  unsigned CurrentIteration = 0;

  for (++A; CurrentIteration < MaxInstructions && A != E; ++A) {
    if (A->isDebugInstr())
      continue;

    if (Pred(&*A))
      return &*A;

    for (MCRegister Reg : NonModifiableRegs) {
      if (A->modifiesRegister(Reg, TRI))
        return nullptr;

      // Record kill flags on overlapping registers seen past the terminator;
      // they are cleared later if the transformation goes through.
      if (Terminator && KillFlagCandidates && A != Terminator &&
          A->killsRegister(Reg, TRI)) {
        for (MachineOperand &MO : A->operands()) {
          if (MO.isReg() && MO.isKill()) {
            Register Candidate = MO.getReg();
            if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
              KillFlagCandidates->push_back(&MO);
          }
        }
      }
    }

    ++CurrentIteration;
  }

  return nullptr;
}
// Determine if Reg is still in use and not re-defined in the range
// (Stop..Start].
bool SIOptimizeExecMasking::isRegisterInUseBetween(MachineInstr &Stop,
                                                   MachineInstr &Start,
                                                   MCRegister Reg,
                                                   bool UseLiveOuts,
                                                   bool IgnoreStart) const {
  LivePhysRegs LR(*TRI);
  // ... seed LR with the block's live-outs when UseLiveOuts is set, then step
  // it backwards from Start (optionally skipping Start itself) to Stop ...
  return !LR.available(*MRI, Reg);
}
// Determine if Reg is still in use and not re-defined between Stop and the
// end of its basic block.
bool SIOptimizeExecMasking::isRegisterInUseAfter(MachineInstr &Stop,
                                                 MCRegister Reg) {
  return isRegisterInUseBetween(Stop, *Stop.getParent()->rbegin(), Reg, true);
}
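// optimizeExecSequence folds the sequence emitted by control-flow lowering,
//   x = copy exec
//   z = s_<op>_b64 x, y
//   exec = copy z
// into
//   x = s_<op>_saveexec_b64 y
// The three instructions are emitted separately earlier because spill code
// may have to be inserted for the saved copy of exec; by this point they can
// usually be collapsed into the single saveexec form.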
bool SIOptimizeExecMasking::optimizeExecSequence() {
  bool Changed = false;
  for (MachineBasicBlock &MBB : *MF) {
    MachineBasicBlock::reverse_iterator I = fixTerminators(MBB);
    MachineBasicBlock::reverse_iterator E = MBB.rend();
    if (I == E)
      continue;

    // It's possible to see other terminator copies after the exec copy.
    Register CopyToExec;

    unsigned SearchCount = 0;
    const unsigned SearchLimit = 5;
    while (I != E && SearchCount++ < SearchLimit) {
      CopyToExec = isCopyToExec(*I);
      if (CopyToExec)
        break;
      ++I;
    }

    if (!CopyToExec)
      continue;

    // Scan backwards to find the def.
    auto *CopyToExecInst = &*I;
    auto CopyFromExecInst = findExecCopy(MBB, I);
    if (CopyFromExecInst == E) {
      auto PrepareExecInst = std::next(I);
      if (PrepareExecInst == E)
        continue;
      // Fold exec = COPY (S_AND_B64 reg, exec) -> exec = S_AND_B64 reg, exec.
      if (CopyToExecInst->getOperand(1).isKill() &&
          isLogicalOpOnExec(*PrepareExecInst) == CopyToExec) {
        PrepareExecInst->getOperand(0).setReg(Exec);
        CopyToExecInst->eraseFromParent();
        Changed = true;
      }
      continue;
    }

    if (isLiveOut(MBB, CopyToExec)) {
      // The copied register is live out and has a second use in another block.
      continue;
    }

    Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
    MachineInstr *SaveExecInst = nullptr;
    SmallVector<MachineInstr *, 4> OtherUseInsts;

    for (MachineBasicBlock::iterator
             J = std::next(CopyFromExecInst->getIterator()),
             JE = I->getIterator();
         J != JE; ++J) {
      if (SaveExecInst && J->readsRegister(Exec, TRI)) {
        LLVM_DEBUG(dbgs() << "exec read prevents saveexec: " << *J << '\n');
        // Make sure this is inserted after any VALU ops that may have been
        // scheduled in between.
        SaveExecInst = nullptr;
        break;
      }

      bool ReadsCopyFromExec = J->readsRegister(CopyFromExec, TRI);

      if (J->modifiesRegister(CopyToExec, TRI)) {
        if (SaveExecInst) {
          SaveExecInst = nullptr;
          break;
        }

        unsigned SaveExecOp = getSaveExecOp(J->getOpcode());
        if (SaveExecOp == AMDGPU::INSTRUCTION_LIST_END)
          break;

        if (ReadsCopyFromExec) {
          SaveExecInst = &*J;
          LLVM_DEBUG(dbgs() << "Found save exec op: " << *SaveExecInst << '\n');
          continue;
        } else {
          LLVM_DEBUG(dbgs()
                     << "Instruction does not read exec copy: " << *J << '\n');
          break;
        }
      } else if (ReadsCopyFromExec && !SaveExecInst) {
        // Make sure no other instruction is trying to use this copy before it
        // is rewritten by the saveexec, i.e. it has a single use. There may
        // have been a copy to vcc etc.
        LLVM_DEBUG(dbgs() << "Found second use of save inst candidate: " << *J
                          << '\n');
        break;
      }

      if (SaveExecInst && J->readsRegister(CopyToExec, TRI)) {
        assert(SaveExecInst != &*J);
        OtherUseInsts.push_back(&*J);
      }
    }

    if (!SaveExecInst)
      continue;

    LLVM_DEBUG(dbgs() << "Insert save exec op: " << *SaveExecInst << '\n');

    MachineOperand &Src0 = SaveExecInst->getOperand(1);
    MachineOperand &Src1 = SaveExecInst->getOperand(2);
    MachineOperand *OtherOp = nullptr;

    if (Src0.isReg() && Src0.getReg() == CopyFromExec) {
      OtherOp = &Src1;
    } else if (Src1.isReg() && Src1.getReg() == CopyFromExec) {
      if (!SaveExecInst->isCommutable())
        break;
      OtherOp = &Src0;
    } else
      llvm_unreachable("unexpected");

    CopyFromExecInst->eraseFromParent();

    auto InsPt = SaveExecInst->getIterator();
    const DebugLoc &DL = SaveExecInst->getDebugLoc();

    BuildMI(MBB, InsPt, DL, TII->get(getSaveExecOp(SaveExecInst->getOpcode())),
            CopyFromExec)
        .addReg(OtherOp->getReg());
    SaveExecInst->eraseFromParent();

    CopyToExecInst->eraseFromParent();

    for (MachineInstr *OtherInst : OtherUseInsts) {
      OtherInst->substituteRegister(CopyToExec, Exec, AMDGPU::NoSubRegister,
                                    *TRI);
    }

    Changed = true;
  }

  return Changed;
}
// Insert the optimized s_mov / v_cmpx sequence based on the operands
// extracted from a v_cmp ... / s_and_saveexec pair.
bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
    MachineInstr &SaveExecInstr, MachineInstr &VCmp, MCRegister Exec) const {
  const int NewOpcode = AMDGPU::getVCMPXOpFromVCMP(VCmp.getOpcode());
  if (NewOpcode == -1)
    return false;

  Register MoveDest = SaveExecInstr.getOperand(0).getReg();
  MachineBasicBlock::instr_iterator InsertPosIt = SaveExecInstr.getIterator();
  if (!SaveExecInstr.uses().empty()) {
    bool IsSGPR32 = TRI->getRegSizeInBits(MoveDest, *MRI) == 32;
    unsigned MovOpcode = IsSGPR32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(*SaveExecInstr.getParent(), InsertPosIt,
            SaveExecInstr.getDebugLoc(), TII->get(MovOpcode), MoveDest)
        .addReg(Exec);
  }

  // Omit the destination: V_CMPX writes EXEC implicitly. Add source and clamp
  // modifiers only if the original v_cmp carried them.
  auto Builder = BuildMI(*VCmp.getParent(), std::next(InsertPosIt),
                         VCmp.getDebugLoc(), TII->get(NewOpcode));

  auto TryAddImmediateValueFromNamedOperand =
      [&](unsigned OperandName) -> void {
    if (auto *Mod = TII->getNamedOperand(VCmp, OperandName))
      Builder.addImm(Mod->getImm());
  };

  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src0_modifiers);
  Builder.add(*TII->getNamedOperand(VCmp, AMDGPU::OpName::src0));
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::src1_modifiers);
  Builder.add(*TII->getNamedOperand(VCmp, AMDGPU::OpName::src1));
  TryAddImmediateValueFromNamedOperand(AMDGPU::OpName::clamp);

  // The kill flags recorded while scanning may no longer be correct.
  for (MachineOperand *MO : KillFlagCandidates)
    MO->setIsKill(false);

  SaveExecInstr.eraseFromParent();
  VCmp.eraseFromParent();
  return true;
}
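// tryRecordVCmpxAndSaveexecSequence below records (on GFX10.3 and later)
// candidates of the shape (illustrative, operands abbreviated, wave32):
//   $sgpr4 = V_CMP_LT_F32_e64 $vgpr0, $vgpr1, ...
//   $sgpr5 = S_AND_SAVEEXEC_B32 $sgpr4
// which optimizeVCMPSaveExecSequence above then rewrites as
//   $sgpr5 = S_MOV_B32 $exec_lo
//   V_CMPX_LT_F32_e64 $vgpr0, $vgpr1, ...
// so the compare writes exec directly, avoiding the intermediate SGPR and
// reducing pipeline stalls.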
// Record candidate v_cmp / s_and_saveexec pairs for the rewrite performed by
// optimizeVCMPSaveExecSequence.
void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
    MachineInstr &MI) {
  if (!ST->hasGFX10_3Insts())
    return;

  const unsigned AndSaveExecOpcode =
      ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;

  if (MI.getOpcode() != AndSaveExecOpcode)
    return;

  Register SaveExecDest = MI.getOperand(0).getReg();
  if (!TRI->isSGPRReg(*MRI, SaveExecDest))
    return;

  MachineOperand *SaveExecSrc0 =
      TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  if (!SaveExecSrc0->isReg())
    return;

  // Find the last v_cmp that defines the saveexec input operand, with no
  // write to exec or to that operand in between.
  MachineInstr *VCmp = findInstrBackwards(
      MI,
      [&](MachineInstr *Check) {
        return AMDGPU::getVCMPXOpFromVCMP(Check->getOpcode()) != -1 &&
               Check->modifiesRegister(SaveExecSrc0->getReg(), TRI);
      },
      {Exec, SaveExecSrc0->getReg()});
  if (!VCmp)
    return;

  MachineOperand *VCmpDest = TII->getNamedOperand(*VCmp, AMDGPU::OpName::sdst);
  assert(VCmpDest && "Should have an sdst operand!");

  // ... further checks elided: bail out if the v_cmp destination is live out
  // of the block, and collect the v_cmp source registers into NonDefRegs ...

  // Skip if the v_cmp result is still needed between the v_cmp and the
  // saveexec, or anywhere after the saveexec.
  if (isRegisterInUseBetween(*VCmp, MI, VCmpDest->getReg(), false, true) ||
      isRegisterInUseAfter(MI, VCmpDest->getReg()))
    return;

  // Make sure the v_cmp source registers are not modified between the v_cmp
  // and the saveexec; kill flags seen on the way are recorded so they can be
  // cleared if the rewrite goes through.
  if (!findInstrBackwards(
          MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
          VCmp, &KillFlagCandidates))
    return;

  SaveExecVCmpMapping[&MI] = VCmp;
}
// Record candidate s_or_saveexec / s_xor-with-exec pairs; they are rewritten
// later by optimizeOrSaveexecXorSequences.
void SIOptimizeExecMasking::tryRecordOrSaveexecXorSequence(MachineInstr &MI) {
  const unsigned XorOpcode =
      ST->isWave32() ? AMDGPU::S_XOR_B32 : AMDGPU::S_XOR_B64;

  if (MI.getOpcode() == XorOpcode && &MI != &MI.getParent()->front()) {
    const MachineOperand &XorDst = MI.getOperand(0);
    const MachineOperand &XorSrc0 = MI.getOperand(1);
    const MachineOperand &XorSrc1 = MI.getOperand(2);

    if (XorDst.isReg() && XorDst.getReg() == Exec && XorSrc0.isReg() &&
        XorSrc1.isReg() &&
        (XorSrc0.getReg() == Exec || XorSrc1.getReg() == Exec)) {
      const unsigned OrSaveexecOpcode = ST->isWave32()
                                            ? AMDGPU::S_OR_SAVEEXEC_B32
                                            : AMDGPU::S_OR_SAVEEXEC_B64;

      // Peek at the previous instruction and check if this is a relevant
      // s_or_saveexec instruction.
      MachineInstr &PossibleOrSaveexec = *MI.getPrevNode();
      if (PossibleOrSaveexec.getOpcode() != OrSaveexecOpcode)
        return;

      const MachineOperand &OrDst = PossibleOrSaveexec.getOperand(0);
      const MachineOperand &OrSrc0 = PossibleOrSaveexec.getOperand(1);
      if (OrDst.isReg() && OrSrc0.isReg())
        OrXors.emplace_back(&PossibleOrSaveexec, &MI);
    }
  }
}
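// The recorded pairs have the shape (made-up registers, wave64):
//   $sgpr0_sgpr1 = S_OR_SAVEEXEC_B64 $sgpr2_sgpr3
//   $exec = S_XOR_B64 $exec, $sgpr0_sgpr1
// Because the s_or_saveexec already stored the old exec in $sgpr0_sgpr1, the
// xor leaves exec equal to $sgpr2_sgpr3 & ~old_exec, which is exactly what
// optimizeOrSaveexecXorSequences below produces with the single
//   $sgpr0_sgpr1 = S_ANDN2_SAVEEXEC_B64 $sgpr2_sgpr3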
bool SIOptimizeExecMasking::optimizeOrSaveexecXorSequences() {
  if (OrXors.empty())
    return false;

  bool Changed = false;
  const unsigned Andn2Opcode = ST->isWave32() ? AMDGPU::S_ANDN2_SAVEEXEC_B32
                                              : AMDGPU::S_ANDN2_SAVEEXEC_B64;

  for (const auto &Pair : OrXors) {
    MachineInstr *Or = nullptr;
    MachineInstr *Xor = nullptr;
    std::tie(Or, Xor) = Pair;
    BuildMI(*Or->getParent(), Or->getIterator(), Or->getDebugLoc(),
            TII->get(Andn2Opcode), Or->getOperand(0).getReg())
        .addReg(Or->getOperand(1).getReg());

    Or->eraseFromParent();
    Xor->eraseFromParent();

    Changed = true;
  }

  return Changed;
}
bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  this->MF = &MF;
  ST = &MF.getSubtarget<GCNSubtarget>();
  TRI = ST->getRegisterInfo();
  TII = ST->getInstrInfo();
  MRI = &MF.getRegInfo();
  Exec = TRI->getExec();

  bool Changed = optimizeExecSequence();

  OrXors.clear();
  SaveExecVCmpMapping.clear();
  KillFlagCandidates.clear();
  static unsigned SearchWindow = 10;
  for (MachineBasicBlock &MBB : MF) {
    unsigned SearchCount = 0;

    for (auto &MI : llvm::reverse(MBB)) {
      if (MI.isDebugInstr())
        continue;

      if (SearchCount >= SearchWindow)
        break;

      tryRecordOrSaveexecXorSequence(MI);
      tryRecordVCmpxAndSaveexecSequence(MI);

      if (MI.modifiesRegister(Exec, TRI))
        break;

      ++SearchCount;
    }
  }

  Changed |= optimizeOrSaveexecXorSequences();
  for (const auto &Entry : SaveExecVCmpMapping) {
    MachineInstr *SaveExecInstr = Entry.getFirst();
    MachineInstr *VCmpInstr = Entry.getSecond();
    Changed |= optimizeVCMPSaveExecSequence(*SaveExecInstr, *VCmpInstr, Exec);
  }

  return Changed;
}