// ... (file header comment and #includes elided) ...

#define DEBUG_TYPE "r600cf"

namespace {
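
// CFStack models the hardware control-flow stack while CF instructions are
// finalized. Loops and whole-quad-mode (WQM) pushes consume full stack
// entries (CurrentEntries); non-WQM pushes consume fractional "sub-entries"
// (CurrentSubEntries), several of which pack into one hardware entry.
// MaxStackSize records the high-water mark of the combined depth.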
struct CFStack {

  enum StackItem {
    ENTRY = 0,
    SUB_ENTRY = 1,
    FIRST_NON_WQM_PUSH = 2,
    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
  };

  const AMDGPUSubtarget *ST;
  std::vector<StackItem> BranchStack;
  std::vector<StackItem> LoopStack;
  unsigned MaxStackSize;
  unsigned CurrentEntries;
  unsigned CurrentSubEntries;

  CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st),
      // Vertex shaders need a stack entry reserved for the CALL_FS emitted
      // at the top of the shader.
      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
      CurrentEntries(0), CurrentSubEntries(0) { }

  unsigned getLoopDepth();
  bool branchStackContains(CFStack::StackItem);
  bool requiresWorkAroundForInst(unsigned Opcode);
  unsigned getSubEntrySize(CFStack::StackItem Item);
  void updateMaxStackSize();
  void pushBranch(unsigned Opcode, bool isWQM = false);
  void pushLoop();
  void popBranch();
  void popLoop();
};

unsigned CFStack::getLoopDepth() {
  return LoopStack.size();
}

bool CFStack::branchStackContains(CFStack::StackItem Item) {
  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
       E = BranchStack.end(); I != E; ++I) {
    if (*I == Item)
      return true;
  }
  return false;
}
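
// Example for requiresWorkAroundForInst() below: with a 64-lane wavefront,
// a fourth live sub-entry (CurrentSubEntries == 4) triggers the
// ALU_PUSH_BEFORE work-around, while three or fewer sub-entries do not.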
bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST->hasCaymanISA() &&
      getLoopDepth() > 1)
    return true;

  if (!ST->hasCFAluBug())
    return false;

  switch (Opcode) {
  default:
    return false;
  case AMDGPU::CF_ALU_PUSH_BEFORE:
  case AMDGPU::CF_ALU_ELSE_AFTER:
  case AMDGPU::CF_ALU_BREAK:
  case AMDGPU::CF_ALU_CONTINUE:
    if (CurrentSubEntries == 0)
      return false;

    if (ST->getWavefrontSize() == 64) {
      // Be conservative: the exact stack allocation used by the hardware is
      // not known for certain, so require the work-around whenever more
      // than 3 sub-entries are live.
      return CurrentSubEntries > 3;
    } else {
      assert(ST->getWavefrontSize() == 32);
      // Same reasoning as above, with 8 sub-entries per entry.
      return CurrentSubEntries > 7;
    }
  }
}
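
// getSubEntrySize() returns how many sub-entries a given stack item costs;
// only the first non-WQM push variants need more than one.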
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
  switch (Item) {
  default:
    return 0;
  case CFStack::FIRST_NON_WQM_PUSH:
    assert(!ST->hasCaymanISA());
    if (ST->getGeneration() <= AMDGPUSubtarget::R700) {
      // +1 for the push operation, +2 of extra space required.
      return 3;
    } else {
      // +1 for the push operation, +1 of extra space required.
      return 2;
    }
  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
    assert(ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    // +1 for the push operation, +1 of extra space required.
    return 2;
  case CFStack::SUB_ENTRY:
    return 1;
  }
}
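
// Sub-entries pack four to a full hardware entry, so the current depth is
// CurrentEntries plus the sub-entry count rounded up to a multiple of four
// and divided by four. Example: CurrentEntries == 1 and
// CurrentSubEntries == 5 gives 1 + RoundUpToAlignment(5, 4) / 4
// = 1 + 8 / 4 = 3 entries.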
void CFStack::updateMaxStackSize() {
  unsigned CurrentStackSize = CurrentEntries +
                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
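
// pushBranch() classifies each push: the first non-WQM push (and, past
// Evergreen, the first one made while a full entry is already live) needs
// extra space; any later non-WQM push costs a single sub-entry, and a WQM
// push costs a full entry.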
void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
  CFStack::StackItem Item = CFStack::ENTRY;
  switch (Opcode) {
  case AMDGPU::CF_PUSH_EG:
  case AMDGPU::CF_ALU_PUSH_BEFORE:
    if (!isWQM) {
      if (!ST->hasCaymanISA() &&
          !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
        // May not be required on Evergreen/NI; see the comment in
        // CFStack::getSubEntrySize().
        Item = CFStack::FIRST_NON_WQM_PUSH;
      else if (CurrentEntries > 0 &&
               ST->getGeneration() > AMDGPUSubtarget::EVERGREEN &&
               !ST->hasCaymanISA() &&
               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
      else
        Item = CFStack::SUB_ENTRY;
    }
    break;
  }

  BranchStack.push_back(Item);
  if (Item == CFStack::ENTRY)
    CurrentEntries++;
  else
    CurrentSubEntries += getSubEntrySize(Item);
  updateMaxStackSize();
}

void CFStack::pushLoop() {
  LoopStack.push_back(CFStack::ENTRY);
  CurrentEntries++;
  updateMaxStackSize();
}

void CFStack::popBranch() {
  CFStack::StackItem Top = BranchStack.back();
  if (Top == CFStack::ENTRY)
    CurrentEntries--;
  else
    CurrentSubEntries -= getSubEntrySize(Top);
  BranchStack.pop_back();
}

void CFStack::popLoop() {
  CurrentEntries--;
  LoopStack.pop_back();
}
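
// The pass itself: it gathers consecutive fetch (texture/vertex) and ALU
// instructions into clauses, lowers the structured control-flow pseudo
// instructions (IF_PREDICATE_SET/ELSE/ENDIF, WHILELOOP/ENDLOOP,
// BREAK/CONTINUE) to hardware CF instructions, and patches clause and
// branch addresses once the final layout is known. A ClauseFile pairs a
// clause-header CF instruction with the instructions forming its body.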
class R600ControlFlowFinalizer : public MachineFunctionPass {

private:
  typedef std::pair<MachineInstr *, std::vector<MachineInstr *> > ClauseFile;

  enum ControlFlowInstruction {
    CF_TC,
    CF_VC,
    CF_CALL_FS,
    CF_WHILE_LOOP,
    CF_END_LOOP,
    CF_LOOP_BREAK,
    CF_LOOP_CONTINUE,
    CF_JUMP,
    CF_ELSE,
    CF_POP,
    CF_END
  };

  static char ID;
  const R600InstrInfo *TII;
  const R600RegisterInfo *TRI;
  unsigned MaxFetchInst;
  const AMDGPUSubtarget *ST;

  bool IsTrivialInst(MachineInstr *MI) const {
    switch (MI->getOpcode()) {
    case AMDGPU::KILL:
    case AMDGPU::RETURN:
      return true;
    default:
      return false;
    }
  }
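
  // Select the subtarget-specific opcode for an abstract CF instruction:
  // R600-family vs Evergreen-family encodings, with Cayman providing its
  // own CF_END.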
  const MCInstrDesc &getHWInstrDesc(ControlFlowInstruction CFI) const {
    unsigned Opcode = 0;
    bool isEg = (ST->getGeneration() >= AMDGPUSubtarget::EVERGREEN);
    switch (CFI) {
    case CF_TC:
      Opcode = isEg ? AMDGPU::CF_TC_EG : AMDGPU::CF_TC_R600;
      break;
    case CF_VC:
      Opcode = isEg ? AMDGPU::CF_VC_EG : AMDGPU::CF_VC_R600;
      break;
    case CF_CALL_FS:
      Opcode = isEg ? AMDGPU::CF_CALL_FS_EG : AMDGPU::CF_CALL_FS_R600;
      break;
    case CF_WHILE_LOOP:
      Opcode = isEg ? AMDGPU::WHILE_LOOP_EG : AMDGPU::WHILE_LOOP_R600;
      break;
    case CF_END_LOOP:
      Opcode = isEg ? AMDGPU::END_LOOP_EG : AMDGPU::END_LOOP_R600;
      break;
    case CF_LOOP_BREAK:
      Opcode = isEg ? AMDGPU::LOOP_BREAK_EG : AMDGPU::LOOP_BREAK_R600;
      break;
    case CF_LOOP_CONTINUE:
      Opcode = isEg ? AMDGPU::CF_CONTINUE_EG : AMDGPU::CF_CONTINUE_R600;
      break;
    case CF_JUMP:
      Opcode = isEg ? AMDGPU::CF_JUMP_EG : AMDGPU::CF_JUMP_R600;
      break;
    case CF_ELSE:
      Opcode = isEg ? AMDGPU::CF_ELSE_EG : AMDGPU::CF_ELSE_R600;
      break;
    case CF_POP:
      Opcode = isEg ? AMDGPU::POP_EG : AMDGPU::POP_R600;
      break;
    case CF_END:
      if (ST->hasCaymanISA()) {
        Opcode = AMDGPU::CF_END_CM;
        break;
      }
      Opcode = isEg ? AMDGPU::CF_END_EG : AMDGPU::CF_END_R600;
      break;
    }
    assert(Opcode && "No opcode selected");
    return TII->get(Opcode);
  }

  bool isCompatibleWithClause(const MachineInstr *MI,
                              std::set<unsigned> &DstRegs) const {
    unsigned DstMI, SrcMI;
    for (MachineInstr::const_mop_iterator I = MI->operands_begin(),
         E = MI->operands_end(); I != E; ++I) {
      const MachineOperand &MO = *I;
      if (!MO.isReg())
        continue;
      if (MO.isDef()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          DstMI = Reg;
        else
          DstMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
      if (MO.isUse()) {
        unsigned Reg = MO.getReg();
        if (AMDGPU::R600_Reg128RegClass.contains(Reg))
          SrcMI = Reg;
        else
          SrcMI = TRI->getMatchingSuperReg(Reg,
              TRI->getSubRegFromChannel(TRI->getHWRegChan(Reg)),
              &AMDGPU::R600_Reg128RegClass);
      }
    }
    if ((DstRegs.find(SrcMI) == DstRegs.end())) {
      DstRegs.insert(DstMI);
      return true;
    } else
      return false;
  }
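
  // A fetch clause collects consecutive texture (or vertex) fetches,
  // stopping at MaxFetchInst instructions, at the first instruction using
  // the other cache, or at an incompatible destination register (see
  // isCompatibleWithClause() above).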
  ClauseFile
  MakeFetchClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineBasicBlock::iterator ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    unsigned AluInstCount = 0;
    bool IsTex = TII->usesTextureCache(ClauseHead);
    std::set<unsigned> DstRegs;
    for (MachineBasicBlock::iterator E = MBB.end(); I != E; ++I) {
      if (IsTrivialInst(I))
        continue;
      if (AluInstCount >= MaxFetchInst)
        break;
      if ((IsTex && !TII->usesTextureCache(I)) ||
          (!IsTex && !TII->usesVertexCache(I)))
        break;
      if (!isCompatibleWithClause(I, DstRegs))
        break;
      AluInstCount++;
      ClauseContent.push_back(I);
    }
    MachineInstr *MIb = BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead),
        getHWInstrDesc(IsTex ? CF_TC : CF_VC))
        .addImm(0) // ADDR
        .addImm(AluInstCount - 1); // COUNT
    return ClauseFile(MIb, std::move(ClauseContent));
  }
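
  // Literal immediates are materialized through the four
  // ALU_LITERAL_{X,Y,Z,W} slots of an instruction group. getLiteral()
  // reuses a slot when the same immediate already occurs in the group:
  // e.g. two operands both reading 0x3F800000 end up sharing one slot.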
  void getLiteral(MachineInstr *MI, std::vector<int64_t> &Lits) const {
    static const unsigned LiteralRegs[] = {
      AMDGPU::ALU_LITERAL_X,
      AMDGPU::ALU_LITERAL_Y,
      AMDGPU::ALU_LITERAL_Z,
      AMDGPU::ALU_LITERAL_W
    };
    const SmallVector<std::pair<MachineOperand *, int64_t>, 3> Srcs =
        TII->getSrcs(MI);
    for (unsigned i = 0, e = Srcs.size(); i < e; ++i) {
      if (Srcs[i].first->getReg() != AMDGPU::ALU_LITERAL_X)
        continue;
      int64_t Imm = Srcs[i].second;
      std::vector<int64_t>::iterator It =
          std::find(Lits.begin(), Lits.end(), Imm);
      if (It != Lits.end()) {
        // Reuse the literal slot that already holds this immediate.
        unsigned Index = It - Lits.begin();
        Srcs[i].first->setReg(LiteralRegs[Index]);
      } else {
        // Allocate a new literal slot.
        assert(Lits.size() < 4 && "Too many literals in Instruction Group");
        Srcs[i].first->setReg(LiteralRegs[Lits.size()]);
        Lits.push_back(Imm);
      }
    }
  }

  MachineBasicBlock::iterator insertLiterals(
      MachineBasicBlock::iterator InsertPos,
      const std::vector<unsigned> &Literals) const {
    MachineBasicBlock *MBB = InsertPos->getParent();
    for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
      unsigned LiteralPair0 = Literals[i];
      unsigned LiteralPair1 = (i + 1 < e) ? Literals[i + 1] : 0;
      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
          TII->get(AMDGPU::LITERALS))
          .addImm(LiteralPair0)
          .addImm(LiteralPair1);
    }
    return InsertPos;
  }

  ClauseFile
  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
      const {
    MachineInstr *ClauseHead = I;
    std::vector<MachineInstr *> ClauseContent;
    I++;
    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
      if (IsTrivialInst(I)) {
        ++I;
        continue;
      }
      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
        break;
      std::vector<int64_t> Literals;
      if (I->isBundle()) {
        MachineInstr *DeleteMI = I;
        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
        while (++BI != E && BI->isBundledWithPred()) {
          BI->unbundleFromPred();
          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
            MachineOperand &MO = BI->getOperand(i);
            if (MO.isReg() && MO.isInternalRead())
              MO.setIsInternalRead(false);
          }
          getLiteral(BI, Literals);
          ClauseContent.push_back(BI);
        }
        I = BI;
        DeleteMI->eraseFromParent();
      } else {
        getLiteral(I, Literals);
        ClauseContent.push_back(I);
        I++;
      }
      for (unsigned i = 0, e = Literals.size(); i < e; i += 2) {
        unsigned literal0 = Literals[i];
        unsigned literal2 = (i + 1 < e) ? Literals[i + 1] : 0;
        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
            TII->get(AMDGPU::LITERALS))
            .addImm(literal0)
            .addImm(literal2);
        ClauseContent.push_back(MILit);
      }
    }
    assert(ClauseContent.size() < 128 && "ALU clause is too big");
    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
    return ClauseFile(ClauseHead, std::move(ClauseContent));
  }
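
  // Emit the recorded clauses at their final position. Note the counting:
  // a fetch instruction occupies two CF-instruction-sized words (hence
  // CfCount += 2 * size below), while an ALU clause body advances CfCount
  // by one per instruction.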
  void EmitFetchClause(MachineBasicBlock::iterator InsertPos,
                       ClauseFile &Clause, unsigned &CfCount) {
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::FETCH_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += 2 * Clause.second.size();
  }

  void EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
                     unsigned &CfCount) {
    Clause.first->getOperand(0).setImm(0);
    CounterPropagateAddr(Clause.first, CfCount);
    MachineBasicBlock *BB = Clause.first->getParent();
    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
        .addImm(CfCount);
    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
      BB->splice(InsertPos, BB, Clause.second[i]);
    }
    CfCount += Clause.second.size();
  }
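
  // Branch and clause targets are first recorded as deltas; once the final
  // address is known, CounterPropagateAddr() adds it onto the pending
  // immediate of each recorded instruction.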
  void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
    MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
  }

  void CounterPropagateAddr(const std::set<MachineInstr *> &MIs,
                            unsigned Addr) const {
    for (MachineInstr *MI : MIs) {
      CounterPropagateAddr(MI, Addr);
    }
  }

public:
  R600ControlFlowFinalizer(TargetMachine &tm)
      : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), ST(nullptr) {}

  bool runOnMachineFunction(MachineFunction &MF) override {
    ST = &MF.getSubtarget<AMDGPUSubtarget>();
    MaxFetchInst = ST->getTexVTXClauseSize();
    TII = static_cast<const R600InstrInfo *>(ST->getInstrInfo());
    TRI = static_cast<const R600RegisterInfo *>(ST->getRegisterInfo());
    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

    CFStack CFStack(ST, MFI->getShaderType());
    for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
      MachineBasicBlock &MBB = *MB;
      unsigned CfCount = 0;
      std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
      std::vector<MachineInstr *> IfThenElseStack;
      if (MFI->getShaderType() == ShaderType::VERTEX) {
        BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
            getHWInstrDesc(CF_CALL_FS));
        CfCount++;
      }
      std::vector<ClauseFile> FetchClauses, AluClauses;
      std::vector<MachineInstr *> LastAlu(1);
      std::vector<MachineInstr *> ToPopAfter;

      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
           I != E;) {
        if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
          DEBUG(dbgs() << CfCount << ":"; I->dump(););
          FetchClauses.push_back(MakeFetchClause(MBB, I));
          CfCount++;
          LastAlu.back() = nullptr;
          continue;
        }

        MachineBasicBlock::iterator MI = I;
        if (MI->getOpcode() != AMDGPU::ENDIF)
          LastAlu.back() = nullptr;
        if (MI->getOpcode() == AMDGPU::CF_ALU)
          LastAlu.back() = MI;
        I++;
        bool RequiresWorkAround =
            CFStack.requiresWorkAroundForInst(MI->getOpcode());
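        // Lower each control-flow pseudo instruction. When the CF_ALU bug
        // work-around applies, ALU_PUSH_BEFORE is split into an explicit
        // CF_PUSH_EG followed by a plain CF_ALU clause (note the
        // fall-through into the CF_ALU case).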
        switch (MI->getOpcode()) {
        case AMDGPU::CF_ALU_PUSH_BEFORE:
          if (RequiresWorkAround) {
            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
            BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                TII->get(AMDGPU::CF_PUSH_EG))
                .addImm(CfCount + 1)
                .addImm(1);
            MI->setDesc(TII->get(AMDGPU::CF_ALU));
            CfCount++;
            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
          } else
            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
          // Fall through.

        case AMDGPU::CF_ALU:
          I = MI;
          AluClauses.push_back(MakeALUClause(MBB, I));
          DEBUG(dbgs() << CfCount << ":"; MI->dump(););
          CfCount++;
          break;
        case AMDGPU::WHILELOOP: {
          CFStack.pushLoop();
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_WHILE_LOOP))
              .addImm(1);
          std::pair<unsigned, std::set<MachineInstr *> > Pair(CfCount,
              std::set<MachineInstr *>());
          Pair.second.insert(MIb);
          LoopStack.push_back(std::move(Pair));
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDLOOP: {
          CFStack.popLoop();
          std::pair<unsigned, std::set<MachineInstr *> > Pair =
              std::move(LoopStack.back());
          LoopStack.pop_back();
          CounterPropagateAddr(Pair.second, CfCount);
          BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_END_LOOP))
              .addImm(Pair.first + 1);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::IF_PREDICATE_SET: {
          LastAlu.push_back(nullptr);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_JUMP))
              .addImm(0)
              .addImm(0);
          IfThenElseStack.push_back(MIb);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ELSE: {
          MachineInstr *JumpInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(JumpInst, CfCount);
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_ELSE))
              .addImm(0)
              .addImm(0);
          DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
          IfThenElseStack.push_back(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::ENDIF: {
          CFStack.popBranch();
          if (LastAlu.back()) {
            // The pop can be folded into the last ALU clause of the region.
            ToPopAfter.push_back(LastAlu.back());
          } else {
            MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
                getHWInstrDesc(CF_POP))
                .addImm(CfCount + 1)
                .addImm(1);
            DEBUG(dbgs() << CfCount << ":"; MIb->dump(););
            CfCount++;
          }

          MachineInstr *IfOrElseInst = IfThenElseStack.back();
          IfThenElseStack.pop_back();
          CounterPropagateAddr(IfOrElseInst, CfCount);
          IfOrElseInst->getOperand(1).setImm(1);
          LastAlu.pop_back();
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::BREAK: {
          CfCount++;
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_BREAK))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          break;
        }
        case AMDGPU::CONTINUE: {
          MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
              getHWInstrDesc(CF_LOOP_CONTINUE))
              .addImm(0);
          LoopStack.back().second.insert(MIb);
          MI->eraseFromParent();
          CfCount++;
          break;
        }
        case AMDGPU::RETURN: {
          DebugLoc DL = MBB.findDebugLoc(MI);
          BuildMI(MBB, MI, DL, getHWInstrDesc(CF_END));
          CfCount++;
          if (CfCount % 2) {
            BuildMI(MBB, I, DL, TII->get(AMDGPU::PAD));
            CfCount++;
          }
          MI->eraseFromParent();
          for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
            EmitFetchClause(I, FetchClauses[i], CfCount);
          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
            EmitALUClause(I, AluClauses[i], CfCount);
          break;
        }
        default:
          if (TII->isExport(MI->getOpcode())) {
            DEBUG(dbgs() << CfCount << ":"; MI->dump(););
            CfCount++;
          }
          break;
        }
      }
      for (unsigned i = 0, e = ToPopAfter.size(); i < e; ++i) {
        MachineInstr *Alu = ToPopAfter[i];
        BuildMI(MBB, Alu, MBB.findDebugLoc((MachineBasicBlock::iterator)Alu),
            TII->get(AMDGPU::CF_ALU_POP_AFTER))
            .addImm(Alu->getOperand(0).getImm())
            .addImm(Alu->getOperand(1).getImm())
            .addImm(Alu->getOperand(2).getImm())
            .addImm(Alu->getOperand(3).getImm())
            .addImm(Alu->getOperand(4).getImm())
            .addImm(Alu->getOperand(5).getImm())
            .addImm(Alu->getOperand(6).getImm())
            .addImm(Alu->getOperand(7).getImm())
            .addImm(Alu->getOperand(8).getImm());
        Alu->eraseFromParent();
      }
      MFI->StackSize = CFStack.MaxStackSize;
    }

    return false;
  }

  const char *getPassName() const override {
    return "R600 Control Flow Finalizer Pass";
  }
};

char R600ControlFlowFinalizer::ID = 0;

} // end anonymous namespace

llvm::FunctionPass *llvm::createR600ControlFlowFinalizer(TargetMachine &TM) {
  return new R600ControlFlowFinalizer(TM);
}