//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
// form and a wave-level control flow graph.
//
// Before this pass, values that are semantically i1 and are defined and used
// within the same basic block are already represented as lane masks in scalar
// registers. However, values that cross basic blocks are always transferred
// between basic blocks in vreg_1 virtual registers and are lowered by this
// pass.
//
// The only instructions that use or define vreg_1 virtual registers are COPY,
// PHI, and IMPLICIT_DEF.
//
//===----------------------------------------------------------------------===//

#include "SILowerI1Copies.h"
#include "AMDGPU.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"

#define DEBUG_TYPE "si-i1-copies"

using namespace llvm;

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);

namespace {

class Vreg1LoweringHelper : public PhiLoweringHelper {
public:
  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
                      MachinePostDominatorTree *PDT);

private:
  DenseSet<Register> ConstrainRegs;

public:
  void markAsLaneMask(Register DstReg) const override;
  void getCandidatesForLowering(
      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
  void collectIncomingValuesFromPhi(
      const MachineInstr *MI,
      SmallVectorImpl<Incoming> &Incomings) const override;
  void replaceDstReg(Register NewReg, Register OldReg,
                     MachineBasicBlock *MBB) override;
  void buildMergeLaneMasks(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator I, const DebugLoc &DL,
                           Register DstReg, Register PrevReg,
                           Register CurReg) override;
  void constrainAsLaneMask(Incoming &In) override;

  bool lowerCopiesFromI1();
  bool lowerCopiesToI1();
  bool cleanConstrainRegs(bool Changed);
  bool isVreg1(Register Reg) const {
    return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
  }
};

Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
                                         MachineDominatorTree *DT,
                                         MachinePostDominatorTree *PDT)
    : PhiLoweringHelper(MF, DT, PDT) {}

bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
  ConstrainRegs.clear();

  return Changed;
}

/// Helper class that determines the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
/// merged with another, previously defined lane mask.
///
/// The approach is as follows:
///  - Determine all basic blocks which, starting from the incoming blocks,
///    a wave may reach before entering the def block (the block containing the
///    phi).
///  - If an incoming block has no predecessors in this set, we can take the
///    incoming value as a scalar lane mask as-is.
///  -- A special case of this is when the def block has a self-loop.
///  - Otherwise, the incoming value needs to be merged with a previously
///    defined lane mask.
///  - If there is a path into the set of reachable blocks that does _not_ go
///    through an incoming block where we can take the scalar lane mask as-is,
///    we need to invent an available value for the SSAUpdater. Choices are
///    0 and undef, with differing consequences for how to merge values etc.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  DenseMap<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    return ReachableMap.find(&MBB)->second;
  }

  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    ReachableOrdered.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);
    ReachableOrdered.push_back(&DefBlock);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);
      ReachableOrdered.push_back(MBB);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

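    // Flood-fill the set of blocks the wave may reach from the seeded
    // successors, stopping at blocks that are already in the set (this
    // includes the def block).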
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!ReachableMap.try_emplace(MBB, false).second)
        continue;
      ReachableOrdered.push_back(MBB);

      append_range(Stack, MBB->successors());
    }

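    // Classify each reachable block: a block with no predecessor inside the
    // set is a source of the induced subgraph; predecessors outside the set
    // are collected so that undef values can be made available there.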
    for (MachineBasicBlock *MBB : ReachableOrdered) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        ReachableMap[MBB] = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};

/// Helper class that detects loops which require us to lower an i1 COPY into
/// bitwise manipulation.
///
/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
/// between loops with the same header. Consider this example:
///
///  A-+-+
///  | | |
///  B-+ |
///  |   |
///  C---+
///
/// A is the header of a loop containing A, B, and C as far as LoopInfo is
/// concerned. However, an i1 COPY in B that is used in C must be lowered to
/// bitwise operations to combine results from different loop iterations when
/// B has a divergent branch (since by default we will compile this code such
/// that threads in a wave are merged at the entry of C).
///
/// The following rule is implemented to determine whether bitwise operations
/// are required: use the bitwise lowering for a def in block B if a backward
/// edge to B is reachable without going through the nearest common
/// post-dominator of B and all uses of the def.
///
/// TODO: This rule is conservative because it does not check whether the
///       relevant branches are actually divergent.
///
/// The class is designed to cache the CFG traversal so that it can be re-used
/// for multiple defs within the same basic block.
///
/// TODO: We could use region analysis to quickly skip over SESE regions during
///       the traversal.
///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable blocks, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    if (!VisitedPostDom)
      advanceLevel();

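    // Walk up the post-dominator tree from the def block towards \p PostDom,
    // expanding the visited region one level at a time.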
    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

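    // Determine the block that dominates the loop level as well as all
    // incoming blocks; this is where (or above which) the undef value is
    // made available.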
    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

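    // Traverse the current level: tag each block with its level, maintain
    // the common dominator, and record backward edges to the def block in
    // FoundLoopLevel. Blocks not post-dominated by VisitedPostDom are
    // deferred to the next level.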
    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};

} // End anonymous namespace.

Register llvm::createLaneMaskReg(
    MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  return MRI->createVirtualRegister(LaneMaskRegAttrs);
}

static Register
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
                    MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
          UndefReg);
  return UndefReg;
}

#ifndef NDEBUG
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
                                const MachineRegisterInfo &MRI,
                                Register Reg) {
  unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
  return Size == 1 || Size == 32;
}
#endif

bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      DebugLoc DL = MI.getDebugLoc();

      assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
      assert(!MI.getOperand(0).getSubReg());

      ConstrainRegs.insert(SrcReg);
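      // V_CNDMASK_B32 dst, 0, -1, src writes -1 to the lanes whose mask bit
      // is set and 0 to all others; the leading zero immediates are the
      // source operand modifiers.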
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
                                     MachineDominatorTree *DT,
                                     MachinePostDominatorTree *PDT)
    : MF(MF), DT(DT), PDT(PDT) {
  MRI = &MF->getRegInfo();

  ST = &MF->getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  IsWave32 = ST->isWave32();

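  // Lane masks hold one bit per lane: wave32 targets use the 32-bit scalar
  // opcodes and EXEC_LO, wave64 targets the 64-bit variants and EXEC.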
  if (IsWave32) {
    ExecReg = AMDGPU::EXEC_LO;
    MovOp = AMDGPU::S_MOV_B32;
    AndOp = AMDGPU::S_AND_B32;
    OrOp = AMDGPU::S_OR_B32;
    XorOp = AMDGPU::S_XOR_B32;
    AndN2Op = AMDGPU::S_ANDN2_B32;
    OrN2Op = AMDGPU::S_ORN2_B32;
  } else {
    ExecReg = AMDGPU::EXEC;
    MovOp = AMDGPU::S_MOV_B64;
    AndOp = AMDGPU::S_AND_B64;
    OrOp = AMDGPU::S_OR_B64;
    XorOp = AMDGPU::S_XOR_B64;
    AndN2Op = AMDGPU::S_ANDN2_B64;
    OrN2Op = AMDGPU::S_ORN2_B64;
  }
}

bool PhiLoweringHelper::lowerPhis() {
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  PhiIncomingAnalysis PIA(*PDT, TII);
  SmallVector<MachineInstr *, 4> Vreg1Phis;
  SmallVector<Incoming, 4> Incomings;

  getCandidatesForLowering(Vreg1Phis);
  if (Vreg1Phis.empty())
    return false;

  DT->updateDFSNumbers();
  MachineBasicBlock *PrevMBB = nullptr;
  for (MachineInstr *MI : Vreg1Phis) {
    MachineBasicBlock &MBB = *MI->getParent();
    if (&MBB != PrevMBB) {
      LF.initialize(MBB);
      PrevMBB = &MBB;
    }

    LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);

    Register DstReg = MI->getOperand(0).getReg();
    markAsLaneMask(DstReg);
    initializeLaneMaskRegisterAttributes(DstReg);

    collectIncomingValuesFromPhi(MI, Incomings);

    // Sort the incomings such that incoming values that dominate other
    // incoming values are sorted earlier. This allows us to do some amount of
    // on-the-fly constant folding.
    // Incoming with smaller DFSNumIn goes first, DFSNumIn is 0 for entry block.
    llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
      return DT->getNode(LHS.Block)->getDFSNumIn() <
             DT->getNode(RHS.Block)->getDFSNumIn();
    });

#ifndef NDEBUG
    PhiRegisters.insert(DstReg);
#endif

    // Phis in a loop that are observed outside the loop receive a simple but
    // conservatively correct treatment.
    std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
    for (MachineInstr &Use : MRI->use_instructions(DstReg))
      DomBlocks.push_back(Use.getParent());

    MachineBasicBlock *PostDomBound =
        PDT->findNearestCommonDominator(DomBlocks);

    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
    unsigned FoundLoopLevel = LF.findLoop(PostDomBound);

    SSAUpdater.Initialize(DstReg);

    if (FoundLoopLevel) {
      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
                        Incomings);

      for (auto &Incoming : Incomings) {
        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
      }

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    } else {
      // The phi is not observed from outside a loop. Use a more accurate
      // lowering.
      PIA.analyze(MBB, Incomings);

      for (MachineBasicBlock *MBB : PIA.predecessors())
        SSAUpdater.AddAvailableValue(
            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));

      for (auto &Incoming : Incomings) {
        MachineBasicBlock &IMBB = *Incoming.Block;
        if (PIA.isSource(IMBB)) {
          constrainAsLaneMask(Incoming);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
        } else {
          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
          SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
        }
      }

      for (auto &Incoming : Incomings) {
        if (!Incoming.UpdatedReg.isValid())
          continue;

        MachineBasicBlock &IMBB = *Incoming.Block;
        buildMergeLaneMasks(
            IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
            SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
      }
    }

    Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
    if (NewReg != DstReg) {
      replaceDstReg(NewReg, DstReg, &MBB);
      MI->eraseFromParent();
    }

    Incomings.clear();
  }
  return true;
}

bool Vreg1LoweringHelper::lowerCopiesToI1() {
  bool Changed = false;
  MachineSSAUpdater SSAUpdater(*MF);
  LoopFinder LF(*DT, *PDT);
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    LF.initialize(MBB);

    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
          MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      if (!isVreg1(DstReg))
        continue;

      Changed = true;

      if (MRI->use_empty(DstReg)) {
        DeadCopies.push_back(&MI);
        continue;
      }

      LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

      markAsLaneMask(DstReg);
      initializeLaneMaskRegisterAttributes(DstReg);

      if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
        continue;

      DebugLoc DL = MI.getDebugLoc();
      Register SrcReg = MI.getOperand(1).getReg();
      assert(!MI.getOperand(1).getSubReg());

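      // A source that is not already a lane mask (e.g. a plain 32-bit value
      // coming from inline assembly) is converted into one by comparing it
      // against zero in every lane.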
      if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
        assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
        BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
            .addReg(SrcReg)
            .addImm(0);
        MI.getOperand(1).setReg(TmpReg);
        SrcReg = TmpReg;
      } else {
        // SrcReg needs to be live beyond copy.
        MI.getOperand(1).setIsKill(false);
      }

      // Defs in a loop that are observed outside the loop must be transformed
      // into appropriate bit manipulation.
      std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
      for (MachineInstr &Use : MRI->use_instructions(DstReg))
        DomBlocks.push_back(Use.getParent());

      MachineBasicBlock *PostDomBound =
          PDT->findNearestCommonDominator(DomBlocks);
      unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
      if (FoundLoopLevel) {
        SSAUpdater.Initialize(DstReg);
        SSAUpdater.AddAvailableValue(&MBB, DstReg);
        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);

        buildMergeLaneMasks(MBB, MI, DL, DstReg,
                            SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
        DeadCopies.push_back(&MI);
      }
    }

    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}

bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
  const MachineInstr *MI;
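  // Look through chains of copies to find the instruction that actually
  // defines the mask.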
  for (;;) {
    MI = MRI->getUniqueVRegDef(Reg);
    if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
      return true;

    if (MI->getOpcode() != AMDGPU::COPY)
      break;

    Reg = MI->getOperand(1).getReg();
    if (!Reg.isVirtual())
      return false;
    if (!isLaneMaskReg(Reg))
      return false;
  }

  if (MI->getOpcode() != MovOp)
    return false;

  if (!MI->getOperand(1).isImm())
    return false;

  int64_t Imm = MI->getOperand(1).getImm();
  if (Imm == 0) {
    Val = false;
    return true;
  }
  if (Imm == -1) {
    Val = true;
    return true;
  }

  return false;
}

static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
  Def = false;
  Use = false;

  for (const MachineOperand &MO : MI.operands()) {
    if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
      if (MO.isUse())
        Use = true;
      else
        Def = true;
    }
  }
}

/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
  auto InsertionPt = MBB.getFirstTerminator();
  bool TerminatorsUseSCC = false;
  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
    bool DefsSCC;
    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
    if (TerminatorsUseSCC || DefsSCC)
      break;
  }

  if (!TerminatorsUseSCC)
    return InsertionPt;

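  // A terminator reads SCC: back up to the instruction that defines SCC so
  // that the lane mask instructions are inserted before it and do not
  // clobber SCC between its def and use.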
  while (InsertionPt != MBB.begin()) {
    InsertionPt--;

    bool DefSCC, UseSCC;
    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
    if (DefSCC)
      return InsertionPt;
  }

  // We should have at least seen an IMPLICIT_DEF or COPY
  llvm_unreachable("SCC used by terminator but no def in block");
}

// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  MRI->setRegClass(DstReg, ST->getBoolRC());
}

void Vreg1LoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB.phis()) {
      if (isVreg1(MI.getOperand(0).getReg()))
        Vreg1Phis.push_back(&MI);
    }
  }
}

void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      continue;
    } else {
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}

void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                        MachineBasicBlock *MBB) {
  MRI->replaceRegWith(NewReg, OldReg);
}

void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
                                              MachineBasicBlock::iterator I,
                                              const DebugLoc &DL,
                                              Register DstReg, Register PrevReg,
                                              Register CurReg) {
  bool PrevVal = false;
  bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
  bool CurVal = false;
  bool CurConstant = isConstantLaneMask(CurReg, CurVal);

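  // The merged mask is (PrevReg & ~EXEC) | (CurReg & EXEC): inactive lanes
  // keep their bits from the previous mask, active lanes take theirs from the
  // current one. The cases below fold this when an input is a known constant
  // (zero or all ones under EXEC).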
  if (PrevConstant && CurConstant) {
    if (PrevVal == CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
    } else if (CurVal) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
    } else {
      BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
          .addReg(ExecReg)
          .addImm(-1);
    }
    return;
  }

  Register PrevMaskedReg;
  Register CurMaskedReg;
  if (!PrevConstant) {
    if (CurConstant && CurVal) {
      PrevMaskedReg = PrevReg;
    } else {
      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
          .addReg(PrevReg)
          .addReg(ExecReg);
    }
  }
  if (!CurConstant) {
    // TODO: check whether CurReg is already masked by EXEC
    if (PrevConstant && PrevVal) {
      CurMaskedReg = CurReg;
    } else {
      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
      BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
          .addReg(CurReg)
          .addReg(ExecReg);
    }
  }

  if (PrevConstant && !PrevVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(CurMaskedReg);
  } else if (CurConstant && !CurVal) {
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(PrevMaskedReg);
  } else if (PrevConstant && PrevVal) {
    BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
        .addReg(CurMaskedReg)
        .addReg(ExecReg);
  } else {
    BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
        .addReg(PrevMaskedReg)
        .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
  }
}

void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

/// Lower all instructions that def or use vreg_1 registers.
///
/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
/// occur around inline assembly. We do this first, before vreg_1 registers
/// are changed to scalar mask registers.
///
/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT,
                           MachinePostDominatorTree &MPDT) {
  // Only need to run this in the SelectionDAG path.
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::Selected))
    return false;

  Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
  bool Changed = false;
  Changed |= Helper.lowerCopiesFromI1();
  Changed |= Helper.lowerPhis();
  Changed |= Helper.lowerCopiesToI1();
  return Helper.cleanConstrainRegs(Changed);
}

PreservedAnalyses
SILowerI1CopiesPass::run(MachineFunction &MF,
                         MachineFunctionAnalysisManager &MFAM) {
  MachineDominatorTree &MDT = MFAM.getResult<MachineDominatorTreeAnalysis>(MF);
  MachinePostDominatorTree &MPDT =
      MFAM.getResult<MachinePostDominatorTreeAnalysis>(MF);
  bool Changed = runFixI1Copies(MF, MDT, MPDT);
  if (!Changed)
    return PreservedAnalyses::all();

  // TODO: Probably preserves most.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return PA;
}

class SILowerI1CopiesLegacy : public MachineFunctionPass {
public:
  static char ID;

  SILowerI1CopiesLegacy() : MachineFunctionPass(ID) {
    initializeSILowerI1CopiesLegacyPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Lower i1 Copies"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<MachineDominatorTreeWrapperPass>();
    AU.addRequired<MachinePostDominatorTreeWrapperPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

bool SILowerI1CopiesLegacy::runOnMachineFunction(MachineFunction &MF) {
  MachineDominatorTree &MDT =
      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
  MachinePostDominatorTree &MPDT =
      getAnalysis<MachinePostDominatorTreeWrapperPass>().getPostDomTree();
  return runFixI1Copies(MF, MDT, MPDT);
}

INITIALIZE_PASS_BEGIN(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(SILowerI1CopiesLegacy, DEBUG_TYPE, "SI Lower i1 Copies",
                    false, false)

char SILowerI1CopiesLegacy::ID = 0;

char &llvm::SILowerI1CopiesLegacyID = SILowerI1CopiesLegacy::ID;

FunctionPass *llvm::createSILowerI1CopiesLegacyPass() {
  return new SILowerI1CopiesLegacy();
}