#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "R600Defines.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
#include "R600RegisterInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include <set>
#include <utility>
#include <vector>

using namespace llvm;

#define DEBUG_TYPE "r600cf"
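// This pass lowers the control-flow pseudo instructions of the R600 backend
// into native hardware CF instructions, computing their addresses on the fly,
// and records the maximum control-flow stack depth the shader requires.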
namespace {

struct CFStack {
  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const R600Subtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const R600Subtarget *st, CallingConv::ID cc) : ST(st),
      // We need to reserve a stack entry for CALL_FS in vertex shaders.
      MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) {}
  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};
unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}
bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}
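// Some R600-family parts need a software work-around when ALU clauses that
// manipulate the CF stack execute while sub-entries are live; the checks
// below are deliberately conservative about when that work-around applies.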
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default:
    return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;
    if (ST->getWavefrontSize() == 64) {
      // We are being conservative here: the work-around is only strictly
      // required if CurrentSubEntries > 3 &&
      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0),
      // but we don't know for certain that our stack allocation matches
      // the hardware's.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // Same conservative reasoning as above, with an 8-sub-entry grouping.
      return CurrentSubEntries > 7;
    }
  }
}
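// Returns how many stack sub-entries a pushed item consumes; the first
// non-WQM push needs extra space on some generations.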
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= R600Subtarget::R700) {
      // +1 for the push operation, +2 extra space required.
      return 3;
    } else {
      // Some documentation says this is unnecessary on Evergreen, but
      // experimentation shows one extra sub-entry is still needed.
      // +1 for the push operation, +1 extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= R600Subtarget::EVERGREEN);
    // +1 for the push operation, +1 extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize =
      CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
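// Four sub-entries pack into one physical stack entry, hence the
// alignTo(..., 4) / 4 above. pushBranch() classifies each push: WQM pushes
// take a full entry, everything else takes sub-entries.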
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        Item = CFStack::FIRST_NON_WQM_PUSH; // See getSubEntrySize().
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > R600Subtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    } else
      Item = CFStack::ENTRY;
    break;
  }
  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}
void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}
void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}
void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}
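// The pass itself. A ClauseFile pairs the CF instruction that heads a clause
// with the instructions belonging to that clause.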
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *>> ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const R600Subtarget *ST;

  bool IsTrivialInst(MachineInstr &MI) const {
    switch (MI.getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= R600Subtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }
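  // A fetch clause must not read a 128-bit register group that an earlier
  // instruction in the same clause has written; DstRegs tracks the written
  // super-registers.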
  bool isCompatibleWithClause(const MachineInstr &MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI, SrcMI;
    for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
         E = MI.operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }
  ClauseFile MakeFetchClause(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator &I) const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(*ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(*I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(*I)) ||
          (!IsTex && !TII->usesVertexCache(*I)))
        break;
      if (!isCompatibleWithClause(*I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(&*I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
                                getHWInstrDesc(IsTex ? CF_TC : CF_VC))
                            .addImm(0)                 // ADDR
                            .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }
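  // Rewrite every ALU_LITERAL_X source to one of the four literal channels
  // (X/Y/Z/W), reusing a channel when the same immediate already occupies one.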
  void getLiteral(MachineInstr &MI, std::vector<MachineOperand *> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (const auto &Src : Srcs) {
      if (Src.first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Src.second;
      std::vector<MachineOperand *>::iterator It =
          find_if(Lits, [&](MachineOperand *val) {
            return val->isImm() && (val->getImm() == Imm);
          });

      // Get the corresponding literal operand.
      MachineOperand &Operand = MI.getOperand(
          TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::literal));

      if (It != Lits.end()) {
        // Reuse an existing literal slot.
        unsigned Index = It - Lits.begin();
        Src.first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate a new literal slot.
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Src.first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(&Operand);
      }
    }
  }
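  // Literal values are emitted as LITERALS instructions, two 32-bit values
  // per instruction; a lone trailing literal is padded with 0.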
  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
                          TII->get(AMDGPU::LITERALS))
                      .addImm(LiteralPair0)
                      .addImm(LiteralPair1);
    }
    return InsertPos;
  }
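  // Collect the ALU instructions that follow a clause head, unbundling
  // instruction groups and materializing their literals inline.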
  ClauseFile MakeALUClause(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator &I) const {
    MachineInstr &ClauseHead = *I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(*I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<MachineOperand *> Literals;
      if (I->isBundle()) {
        MachineInstr &DeleteMI = *I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (MachineOperand &MO : BI->operands()) {
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(*BI, Literals);
          ClauseContent.push_back(&*BI);
        }
        I = BI;
        DeleteMI.eraseFromParent();
      } else {
        getLiteral(*I, Literals);
        ClauseContent.push_back(&*I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        MachineInstrBuilder MILit = BuildMI(MBB, I, I->getDebugLoc(),
                                            TII->get(AMDGPU::LITERALS));
        if (Literals[i]->isImm()) {
          MILit.addImm(Literals[i]->getImm());
        } else {
          MILit.addGlobalAddress(Literals[i]->getGlobal(),
                                 Literals[i]->getOffset());
        }
        if (i + 1 < e) {
          if (Literals[i + 1]->isImm()) {
            MILit.addImm(Literals[i + 1]->getImm());
          } else {
            MILit.addGlobalAddress(Literals[i + 1]->getGlobal(),
                                   Literals[i + 1]->getOffset());
          }
        } else
          MILit.addImm(0);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead.getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(&ClauseHead, std::move(ClauseContent));
  }
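  // Each fetch instruction occupies two CF words, so CfCount advances by
  // twice the clause size.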
  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       const DebugLoc &DL, ClauseFile &Clause,
                       unsigned &CfCount) {
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::FETCH_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }
  void EmitALUClause(MachineBasicBlock::iterator InsertPos, const DebugLoc &DL,
                     ClauseFile &Clause, unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(*Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, DL, TII->get(AMDGPU::ALU_CLAUSE)).addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }
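  // CF instructions whose target address is not yet known carry a placeholder
  // address operand; these helpers patch it once the address is resolved.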
  void CounterPropagateAddr(MachineInstr &MI, unsigned Addr) const {
    MI.getOperand(0).setImm(Addr + MI.getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(*MI, Addr);
    }
  }
public:
  static char ID;

  R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<R600Subtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = ST->getInstrInfo();
    TRI = ST->getRegisterInfo();
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MF.getFunction()->getCallingConv());
    for (MachineBasicBlock &MBB : MF) {
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *>>> LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
                getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;
      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(*I) || TII->usesVertexCache(*I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }
        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = &*MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                    TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          LLVM_FALLTHROUGH;
        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_WHILE_LOOP))
                                  .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *>> Pair(
              CfCount, std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *>> Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_JUMP))
                                  .addImm(0)
                                  .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_ELSE))
                                  .addImm(0)
                                  .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                        getHWInstrDesc(CF_POP))
                                    .addImm(CfCount + 1)
                                    .addImm(1);
            (void)MIb;
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(*IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_BREAK))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                                      getHWInstrDesc(CF_LOOP_CONTINUE))
                                  .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, DL, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, DL, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
                TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->CFStackSize = CFStack.MaxStackSize;
    }

    return false;
  }
  StringRef getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace
FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}