LLVM 23.0.0git
SILowerI1Copies.cpp
Go to the documentation of this file.
1//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
10// to lane masks (32 / 64-bit scalar registers). The pass assumes machine SSA
11// form and a wave-level control flow graph.
12//
13// Before this pass, values that are semantically i1 and are defined and used
14// within the same basic block are already represented as lane masks in scalar
15// registers. However, values that cross basic blocks are always transferred
16// between basic blocks in vreg_1 virtual registers and are lowered by this
17// pass.
18//
19// The only instructions that use or define vreg_1 virtual registers are COPY,
20// PHI, and IMPLICIT_DEF.
21//
22//===----------------------------------------------------------------------===//
23
24#include "SILowerI1Copies.h"
25#include "AMDGPU.h"
28
29#define DEBUG_TYPE "si-i1-copies"
30
31using namespace llvm;
32
33static Register
35 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs);
36
37namespace {
38
39class Vreg1LoweringHelper : public PhiLoweringHelper {
40public:
41 Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
43
44private:
45 DenseSet<Register> ConstrainRegs;
46
47public:
48 void markAsLaneMask(Register DstReg) const override;
49 void getCandidatesForLowering(
50 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
51 void collectIncomingValuesFromPhi(
52 const MachineInstr *MI,
53 SmallVectorImpl<Incoming> &Incomings) const override;
54 void replaceDstReg(Register NewReg, Register OldReg,
55 MachineBasicBlock *MBB) override;
56 void buildMergeLaneMasks(MachineBasicBlock &MBB,
58 Register DstReg, Register PrevReg,
59 Register CurReg) override;
60 void constrainAsLaneMask(Incoming &In) override;
61
62 bool lowerCopiesFromI1();
63 bool lowerCopiesToI1();
64 bool cleanConstrainRegs(bool Changed);
65 bool isVreg1(Register Reg) const {
66 return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
67 }
68};
69
70Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
73 : PhiLoweringHelper(MF, DT, PDT) {}
74
/// Final fixup step: constrain every register recorded in ConstrainRegs to
/// the wave mask register class, then return \p Changed unchanged so callers
/// can forward the aggregated "modified" flag.
bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
  // ConstrainRegs is only ever populated by a lowering that set Changed.
  assert(Changed || ConstrainRegs.empty());
  for (Register Reg : ConstrainRegs)
    MRI->constrainRegClass(Reg, TII->getRegisterInfo().getWaveMaskRegClass());
  ConstrainRegs.clear();

  return Changed;
}
83
84/// Helper class that determines the relationship between incoming values of a
85/// phi in the control flow graph to determine where an incoming value can
86/// simply be taken as a scalar lane mask as-is, and where it needs to be
87/// merged with another, previously defined lane mask.
88///
89/// The approach is as follows:
90/// - Determine all basic blocks which, starting from the incoming blocks,
91/// a wave may reach before entering the def block (the block containing the
92/// phi).
93/// - If an incoming block has no predecessors in this set, we can take the
94/// incoming value as a scalar lane mask as-is.
95/// -- A special case of this is when the def block has a self-loop.
96/// - Otherwise, the incoming value needs to be merged with a previously
97/// defined lane mask.
98/// - If there is a path into the set of reachable blocks that does _not_ go
99/// through an incoming block where we can take the scalar lane mask as-is,
100/// we need to invent an available value for the SSAUpdater. Choices are
101/// 0 and undef, with differing consequences for how to merge values etc.
102///
103/// TODO: We could use region analysis to quickly skip over SESE regions during
104/// the traversal.
105///
class PhiIncomingAnalysis {
  MachinePostDominatorTree &PDT;
  const SIInstrInfo *TII;

  // For each reachable basic block, whether it is a source in the induced
  // subgraph of the CFG.
  MapVector<MachineBasicBlock *, bool> ReachableMap;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> Predecessors;

public:
  PhiIncomingAnalysis(MachinePostDominatorTree &PDT, const SIInstrInfo *TII)
      : PDT(PDT), TII(TII) {}

  /// Returns whether \p MBB is a source in the induced subgraph of reachable
  /// blocks.
  bool isSource(MachineBasicBlock &MBB) const {
    // Precondition: analyze() has run and visited MBB; find() must succeed.
    return ReachableMap.find(&MBB)->second;
  }

  /// Blocks outside the reachable set that branch into it; collected by
  /// analyze() so the caller can seed the SSAUpdater for them.
  ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

  /// Compute the set of blocks a wave may reach between the \p Incomings'
  /// blocks and \p DefBlock, and classify each reachable block as source or
  /// non-source.
  void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
    assert(Stack.empty());
    ReachableMap.clear();
    Predecessors.clear();

    // Insert the def block first, so that it acts as an end point for the
    // traversal.
    ReachableMap.try_emplace(&DefBlock, false);

    for (auto Incoming : Incomings) {
      MachineBasicBlock *MBB = Incoming.Block;
      if (MBB == &DefBlock) {
        ReachableMap[&DefBlock] = true; // self-loop on DefBlock
        continue;
      }

      ReachableMap.try_emplace(MBB, false);

      // If this block has a divergent terminator and the def block is its
      // post-dominator, the wave may first visit the other successors.
      if (TII->hasDivergentBranch(MBB) && PDT.dominates(&DefBlock, MBB))
        append_range(Stack, MBB->successors());
    }

    // Forward-propagate reachability until the frontier is exhausted; the
    // pre-inserted def block stops the traversal.
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      if (ReachableMap.try_emplace(MBB, false).second)
        append_range(Stack, MBB->successors());
    }

    // A reachable block with no reachable predecessor is a source (its
    // incoming value can be used as-is); otherwise record its unreachable
    // predecessors for the caller. Stack is reused here as scratch space for
    // the current block's unreachable predecessors.
    for (auto &[MBB, Reachable] : ReachableMap) {
      bool HaveReachablePred = false;
      for (MachineBasicBlock *Pred : MBB->predecessors()) {
        if (ReachableMap.count(Pred)) {
          HaveReachablePred = true;
        } else {
          Stack.push_back(Pred);
        }
      }
      if (!HaveReachablePred)
        Reachable = true;
      if (HaveReachablePred) {
        for (MachineBasicBlock *UnreachablePred : Stack) {
          if (!llvm::is_contained(Predecessors, UnreachablePred))
            Predecessors.push_back(UnreachablePred);
        }
      }
      Stack.clear();
    }
  }
};
179
180/// Helper class that detects loops which require us to lower an i1 COPY into
181/// bitwise manipulation.
182///
183/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
184/// between loops with the same header. Consider this example:
185///
186/// A-+-+
187/// | | |
188/// B-+ |
189/// | |
190/// C---+
191///
192/// A is the header of a loop containing A, B, and C as far as LoopInfo is
193/// concerned. However, an i1 COPY in B that is used in C must be lowered to
194/// bitwise operations to combine results from different loop iterations when
195/// B has a divergent branch (since by default we will compile this code such
196/// that threads in a wave are merged at the entry of C).
197///
198/// The following rule is implemented to determine whether bitwise operations
199/// are required: use the bitwise lowering for a def in block B if a backward
200/// edge to B is reachable without going through the nearest common
201/// post-dominator of B and all uses of the def.
202///
203/// TODO: This rule is conservative because it does not check whether the
204/// relevant branches are actually divergent.
205///
206/// The class is designed to cache the CFG traversal so that it can be re-used
207/// for multiple defs within the same basic block.
208///
209/// TODO: We could use region analysis to quickly skip over SESE regions during
210/// the traversal.
211///
class LoopFinder {
  MachineDominatorTree &DT;
  MachinePostDominatorTree &PDT;

  // All visited / reachable block, tagged by level (level 0 is the def block,
  // level 1 are all blocks reachable including but not going through the def
  // block's IPDOM, etc.).
  DenseMap<MachineBasicBlock *, unsigned> Visited;

  // Nearest common dominator of all visited blocks by level (level 0 is the
  // def block). Used for seeding the SSAUpdater.
  SmallVector<MachineBasicBlock *, 4> CommonDominators;

  // Post-dominator of all visited blocks.
  MachineBasicBlock *VisitedPostDom = nullptr;

  // Level at which a loop was found: 0 is not possible; 1 = a backward edge is
  // reachable without going through the IPDOM of the def block (if the IPDOM
  // itself has an edge to the def block, the loop level is 2), etc.
  unsigned FoundLoopLevel = ~0u;

  MachineBasicBlock *DefBlock = nullptr;
  SmallVector<MachineBasicBlock *, 4> Stack;
  SmallVector<MachineBasicBlock *, 4> NextLevel;

public:
  LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
      : DT(DT), PDT(PDT) {}

  /// Reset all cached traversal state and make \p MBB the def block for
  /// subsequent findLoop() queries.
  void initialize(MachineBasicBlock &MBB) {
    Visited.clear();
    CommonDominators.clear();
    Stack.clear();
    NextLevel.clear();
    VisitedPostDom = nullptr;
    FoundLoopLevel = ~0u;

    DefBlock = &MBB;
  }

  /// Check whether a backward edge can be reached without going through the
  /// given \p PostDom of the def block.
  ///
  /// Return the level of \p PostDom if a loop was found, or 0 otherwise.
  unsigned findLoop(MachineBasicBlock *PostDom) {
    MachineDomTreeNode *PDNode = PDT.getNode(DefBlock);

    // Lazily perform the first traversal level.
    if (!VisitedPostDom)
      advanceLevel();

    // Walk up the post-dominator tree from the def block towards PostDom,
    // expanding the cached traversal one level at a time as needed.
    unsigned Level = 0;
    while (PDNode->getBlock() != PostDom) {
      if (PDNode->getBlock() == VisitedPostDom)
        advanceLevel();
      PDNode = PDNode->getIDom();
      Level++;
      if (FoundLoopLevel == Level)
        return Level;
    }

    return 0;
  }

  /// Add undef values dominating the loop and the optionally given additional
  /// blocks, so that the SSA updater doesn't have to search all the way to the
  /// function entry.
  void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
                      MachineRegisterInfo &MRI,
                      MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs,
                      ArrayRef<Incoming> Incomings = {}) {
    assert(LoopLevel < CommonDominators.size());

    MachineBasicBlock *Dom = CommonDominators[LoopLevel];
    for (auto &Incoming : Incomings)
      Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);

    if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
      SSAUpdater.AddAvailableValue(
          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
    } else {
      // The dominator is part of the loop or the given blocks, so add the
      // undef value to unreachable predecessors instead.
      for (MachineBasicBlock *Pred : Dom->predecessors()) {
        if (!inLoopLevel(*Pred, LoopLevel, Incomings))
          SSAUpdater.AddAvailableValue(
              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
      }
    }
  }

private:
  /// Whether \p MBB was visited at or below \p LoopLevel, or is one of the
  /// \p Incomings' blocks.
  bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
                   ArrayRef<Incoming> Incomings) const {
    auto DomIt = Visited.find(&MBB);
    if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
      return true;

    for (auto &Incoming : Incomings)
      if (Incoming.Block == &MBB)
        return true;

    return false;
  }

  /// Expand the traversal by one level: visit all blocks dominated (in the
  /// post-dominator sense) by the next VisitedPostDom, update the per-level
  /// common dominator, and record any backward edge to the def block in
  /// FoundLoopLevel.
  void advanceLevel() {
    MachineBasicBlock *VisitedDom;

    if (!VisitedPostDom) {
      // First level: the traversal starts at the def block itself.
      VisitedPostDom = DefBlock;
      VisitedDom = DefBlock;
      Stack.push_back(DefBlock);
    } else {
      // Raise the horizon one step in the post-dominator tree and move the
      // blocks it now covers from NextLevel onto the work stack.
      VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
      VisitedDom = CommonDominators.back();

      for (unsigned i = 0; i < NextLevel.size();) {
        if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
          Stack.push_back(NextLevel[i]);

          // Swap-and-pop removal; intentionally do not advance i.
          NextLevel[i] = NextLevel.back();
          NextLevel.pop_back();
        } else {
          i++;
        }
      }
    }

    unsigned Level = CommonDominators.size();
    while (!Stack.empty()) {
      MachineBasicBlock *MBB = Stack.pop_back_val();
      // Blocks beyond the current post-dominator horizon belong to a later
      // level; remember them for the next advanceLevel() call.
      if (!PDT.dominates(VisitedPostDom, MBB))
        NextLevel.push_back(MBB);

      Visited[MBB] = Level;
      VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

      for (MachineBasicBlock *Succ : MBB->successors()) {
        if (Succ == DefBlock) {
          // Backward edge to the def block found; edges out of the current
          // post-dominator itself count towards the next level.
          if (MBB == VisitedPostDom)
            FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
          else
            FoundLoopLevel = std::min(FoundLoopLevel, Level);
          continue;
        }

        if (Visited.try_emplace(Succ, ~0u).second) {
          if (MBB == VisitedPostDom)
            NextLevel.push_back(Succ);
          else
            Stack.push_back(Succ);
        }
      }
    }

    CommonDominators.push_back(VisitedDom);
  }
};
369
370} // End anonymous namespace.
371
374 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
375 return MRI->createVirtualRegister(LaneMaskRegAttrs);
376}
377
378static Register
380 MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs) {
381 MachineFunction &MF = *MBB->getParent();
382 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
383 const SIInstrInfo *TII = ST.getInstrInfo();
384 Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
385 BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
386 UndefReg);
387 return UndefReg;
388}
389
390#ifndef NDEBUG
392 const MachineRegisterInfo &MRI,
393 Register Reg) {
394 unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
395 return Size == 1 || Size == 32;
396}
397#endif
398
/// Lower COPYs whose source is a vreg_1 into V_CNDMASK_B32 selects that
/// materialize 0 / -1 per lane in the 32-bit destination. Returns true if
/// anything was changed.
bool Vreg1LoweringHelper::lowerCopiesFromI1() {
  bool Changed = false;
  SmallVector<MachineInstr *, 4> DeadCopies;

  for (MachineBasicBlock &MBB : *MF) {
    for (MachineInstr &MI : MBB) {
      if (MI.getOpcode() != AMDGPU::COPY)
        continue;

      Register DstReg = MI.getOperand(0).getReg();
      Register SrcReg = MI.getOperand(1).getReg();
      if (!isVreg1(SrcReg))
        continue;

      // Copies into lane-mask or vreg_1 destinations are handled by the other
      // lowering steps, not here.
      if (isLaneMaskReg(DstReg) || isVreg1(DstReg))
        continue;

      Changed = true;

      // Copy into a 32-bit vector register.
      LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
      const DebugLoc &DL = MI.getDebugLoc();

      assert(!MI.getOperand(0).getSubReg());

      // The source will be read as a lane mask; record it so
      // cleanConstrainRegs() can constrain its register class afterwards.
      ConstrainRegs.insert(SrcReg);
      // dst[lane] = SrcReg[lane] ? -1 : 0
      // (operands: src0_mods, src0=0, src1_mods, src1=-1, mask=SrcReg).
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .addImm(0)
          .addImm(0)
          .addImm(-1)
          .addReg(SrcReg);
      DeadCopies.push_back(&MI);
    }

    // Erase the replaced copies only after scanning the block so the
    // instruction iteration above is not invalidated.
    for (MachineInstr *MI : DeadCopies)
      MI->eraseFromParent();
    DeadCopies.clear();
  }
  return Changed;
}
441
445 : MF(MF), DT(DT), PDT(PDT), ST(&MF->getSubtarget<GCNSubtarget>()),
446 LMC(&AMDGPU::LaneMaskConstants::get(*ST)) {
447 MRI = &MF->getRegInfo();
448
449 TII = ST->getInstrInfo();
450}
451
454 LoopFinder LF(*DT, *PDT);
455 PhiIncomingAnalysis PIA(*PDT, TII);
457 SmallVector<Incoming, 4> Incomings;
458
459 getCandidatesForLowering(Vreg1Phis);
460 if (Vreg1Phis.empty())
461 return false;
462
463 DT->updateDFSNumbers();
464 MachineBasicBlock *PrevMBB = nullptr;
465 for (MachineInstr *MI : Vreg1Phis) {
466 MachineBasicBlock &MBB = *MI->getParent();
467 if (&MBB != PrevMBB) {
468 LF.initialize(MBB);
469 PrevMBB = &MBB;
470 }
471
472 LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
473
474 Register DstReg = MI->getOperand(0).getReg();
475 markAsLaneMask(DstReg);
477
479
480 // Sort the incomings such that incoming values that dominate other incoming
481 // values are sorted earlier. This allows us to do some amount of on-the-fly
482 // constant folding.
483 // Incoming with smaller DFSNumIn goes first, DFSNumIn is 0 for entry block.
484 llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
485 return DT->getNode(LHS.Block)->getDFSNumIn() <
486 DT->getNode(RHS.Block)->getDFSNumIn();
487 });
488
489#ifndef NDEBUG
490 PhiRegisters.insert(DstReg);
491#endif
492
493 // Phis in a loop that are observed outside the loop receive a simple but
494 // conservatively correct treatment.
495 std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
496 for (MachineInstr &Use : MRI->use_instructions(DstReg))
497 DomBlocks.push_back(Use.getParent());
498
499 MachineBasicBlock *PostDomBound =
500 PDT->findNearestCommonDominator(DomBlocks);
501
    // FIXME: This fails to find irreducible cycles. If we have a def (other
    // than a constant) in a pair of blocks that end up looping back to each
    // other, it will be mishandled. Due to structurization this shouldn't
    // occur in practice.
506 unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
507
508 SSAUpdater.Initialize(DstReg);
509
510 if (FoundLoopLevel) {
511 LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
512 Incomings);
513
514 for (auto &Incoming : Incomings) {
517 }
518
519 for (auto &Incoming : Incomings) {
524 }
525 } else {
526 // The phi is not observed from outside a loop. Use a more accurate
527 // lowering.
528 PIA.analyze(MBB, Incomings);
529
530 for (MachineBasicBlock *MBB : PIA.predecessors())
533
534 for (auto &Incoming : Incomings) {
536 if (PIA.isSource(IMBB)) {
539 } else {
542 }
543 }
544
545 for (auto &Incoming : Incomings) {
547 continue;
548
553 }
554 }
555
557 if (NewReg != DstReg) {
558 replaceDstReg(NewReg, DstReg, &MBB);
559 MI->eraseFromParent();
560 }
561
562 Incomings.clear();
563 }
564 return true;
565}
566
567bool Vreg1LoweringHelper::lowerCopiesToI1() {
568 bool Changed = false;
570 LoopFinder LF(*DT, *PDT);
572
573 for (MachineBasicBlock &MBB : *MF) {
574 LF.initialize(MBB);
575
576 for (MachineInstr &MI : MBB) {
577 if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
578 MI.getOpcode() != AMDGPU::COPY)
579 continue;
580
581 Register DstReg = MI.getOperand(0).getReg();
582 if (!isVreg1(DstReg))
583 continue;
584
585 Changed = true;
586
587 if (MRI->use_empty(DstReg)) {
588 DeadCopies.push_back(&MI);
589 continue;
590 }
591
592 LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
593
594 markAsLaneMask(DstReg);
595 initializeLaneMaskRegisterAttributes(DstReg);
596
597 if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
598 continue;
599
600 const DebugLoc &DL = MI.getDebugLoc();
601 Register SrcReg = MI.getOperand(1).getReg();
602 assert(!MI.getOperand(1).getSubReg());
603
604 if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
605 assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
606 Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
607 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
608 .addReg(SrcReg)
609 .addImm(0);
610 MI.getOperand(1).setReg(TmpReg);
611 SrcReg = TmpReg;
612 } else {
613 // SrcReg needs to be live beyond copy.
614 MI.getOperand(1).setIsKill(false);
615 }
616
617 // Defs in a loop that are observed outside the loop must be transformed
618 // into appropriate bit manipulation.
619 std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
620 for (MachineInstr &Use : MRI->use_instructions(DstReg))
621 DomBlocks.push_back(Use.getParent());
622
623 MachineBasicBlock *PostDomBound =
624 PDT->findNearestCommonDominator(DomBlocks);
625 unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
626 if (FoundLoopLevel) {
627 SSAUpdater.Initialize(DstReg);
629 LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);
630
631 buildMergeLaneMasks(MBB, MI, DL, DstReg,
633 DeadCopies.push_back(&MI);
634 }
635 }
636
637 for (MachineInstr *MI : DeadCopies)
638 MI->eraseFromParent();
639 DeadCopies.clear();
640 }
641 return Changed;
642}
643
645 const MachineInstr *MI;
646 for (;;) {
647 MI = MRI->getUniqueVRegDef(Reg);
648 if (MI->getOpcode() == AMDGPU::IMPLICIT_DEF)
649 return true;
650
651 if (MI->getOpcode() != AMDGPU::COPY)
652 break;
653
654 Reg = MI->getOperand(1).getReg();
655 if (!Reg.isVirtual())
656 return false;
657 if (!isLaneMaskReg(Reg))
658 return false;
659 }
660
661 if (MI->getOpcode() != LMC->MovOpc)
662 return false;
663
664 if (!MI->getOperand(1).isImm())
665 return false;
666
667 int64_t Imm = MI->getOperand(1).getImm();
668 if (Imm == 0) {
669 Val = false;
670 return true;
671 }
672 if (Imm == -1) {
673 Val = true;
674 return true;
675 }
676
677 return false;
678}
679
680static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
681 Def = false;
682 Use = false;
683
684 for (const MachineOperand &MO : MI.operands()) {
685 if (MO.isReg() && MO.getReg() == AMDGPU::SCC) {
686 if (MO.isUse())
687 Use = true;
688 else
689 Def = true;
690 }
691 }
692}
693
694/// Return a point at the end of the given \p MBB to insert SALU instructions
695/// for lane mask calculation. Take terminators and SCC into account.
698 auto InsertionPt = MBB.getFirstTerminator();
699 bool TerminatorsUseSCC = false;
700 for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
701 bool DefsSCC;
702 instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
703 if (TerminatorsUseSCC || DefsSCC)
704 break;
705 }
706
707 if (!TerminatorsUseSCC)
708 return InsertionPt;
709
710 while (InsertionPt != MBB.begin()) {
711 InsertionPt--;
712
713 bool DefSCC, UseSCC;
714 instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
715 if (DefSCC)
716 return InsertionPt;
717 }
718
719 // We should have at least seen an IMPLICIT_DEF or COPY
720 llvm_unreachable("SCC used by terminator but no def in block");
721}
722
// VReg_1 -> SReg_32 or SReg_64
void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
  // Retag the register with the subtarget's boolean (wave mask) class.
  MRI->setRegClass(DstReg, ST->getBoolRC());
}
727
728void Vreg1LoweringHelper::getCandidatesForLowering(
729 SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
730 for (MachineBasicBlock &MBB : *MF) {
731 for (MachineInstr &MI : MBB.phis()) {
732 if (isVreg1(MI.getOperand(0).getReg()))
733 Vreg1Phis.push_back(&MI);
734 }
735 }
736}
737
/// Gather the (register, block) incoming pairs of the PHI \p MI. Copies are
/// looked through to the underlying lane-mask / vreg_1 source, and
/// IMPLICIT_DEF incomings are dropped entirely.
void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
  // PHI operands come in (value, predecessor-block) pairs starting at
  // operand 1.
  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
    assert(i + 1 < MI->getNumOperands());
    Register IncomingReg = MI->getOperand(i).getReg();
    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
      // Look through the copy to its source.
      IncomingReg = IncomingDef->getOperand(1).getReg();
      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
      assert(!IncomingDef->getOperand(1).getSubReg());
    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
      // Undef incomings are skipped; a value will be invented for them later.
      continue;
    } else {
      // Anything else must itself be a PHI (possibly one already processed).
      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
    }

    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
  }
}
759
760void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
762 MRI->replaceRegWith(NewReg, OldReg);
763}
764
765void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
767 const DebugLoc &DL,
768 Register DstReg, Register PrevReg,
769 Register CurReg) {
770 bool PrevVal = false;
771 bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
772 bool CurVal = false;
773 bool CurConstant = isConstantLaneMask(CurReg, CurVal);
774
775 if (PrevConstant && CurConstant) {
776 if (PrevVal == CurVal) {
777 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
778 } else if (CurVal) {
779 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(LMC->ExecReg);
780 } else {
781 BuildMI(MBB, I, DL, TII->get(LMC->XorOpc), DstReg)
782 .addReg(LMC->ExecReg)
783 .addImm(-1);
784 }
785 return;
786 }
787
788 Register PrevMaskedReg;
789 Register CurMaskedReg;
790 if (!PrevConstant) {
791 if (CurConstant && CurVal) {
792 PrevMaskedReg = PrevReg;
793 } else {
794 PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
795 BuildMI(MBB, I, DL, TII->get(LMC->AndN2Opc), PrevMaskedReg)
796 .addReg(PrevReg)
797 .addReg(LMC->ExecReg);
798 }
799 }
800 if (!CurConstant) {
801 // TODO: check whether CurReg is already masked by EXEC
802 if (PrevConstant && PrevVal) {
803 CurMaskedReg = CurReg;
804 } else {
805 CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
806 BuildMI(MBB, I, DL, TII->get(LMC->AndOpc), CurMaskedReg)
807 .addReg(CurReg)
808 .addReg(LMC->ExecReg);
809 }
810 }
811
812 if (PrevConstant && !PrevVal) {
813 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
814 .addReg(CurMaskedReg);
815 } else if (CurConstant && !CurVal) {
816 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
817 .addReg(PrevMaskedReg);
818 } else if (PrevConstant && PrevVal) {
819 BuildMI(MBB, I, DL, TII->get(LMC->OrN2Opc), DstReg)
820 .addReg(CurMaskedReg)
821 .addReg(LMC->ExecReg);
822 } else {
823 BuildMI(MBB, I, DL, TII->get(LMC->OrOpc), DstReg)
824 .addReg(PrevMaskedReg)
825 .addReg(CurMaskedReg ? CurMaskedReg : LMC->ExecReg);
826 }
827}
828
// Intentionally a no-op in the vreg_1 (SelectionDAG) flow; other
// PhiLoweringHelper implementations may need to constrain the incoming value.
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}
830
831/// Lower all instructions that def or use vreg_1 registers.
832///
833/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
834/// occur around inline assembly. We do this first, before vreg_1 registers
835/// are changed to scalar mask registers.
836///
837/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
838/// all others, because phi lowering looks through copies and can therefore
839/// often make copy lowering unnecessary.
842 // Only need to run this in SelectionDAG path.
843 if (MF.getProperties().hasSelected())
844 return false;
845
846 Vreg1LoweringHelper Helper(&MF, &MDT, &MPDT);
847 bool Changed = false;
848 Changed |= Helper.lowerCopiesFromI1();
849 Changed |= Helper.lowerPhis();
850 Changed |= Helper.lowerCopiesToI1();
851 return Helper.cleanConstrainRegs(Changed);
852}
853
867
869public:
870 static char ID;
871
873
874 bool runOnMachineFunction(MachineFunction &MF) override;
875
876 StringRef getPassName() const override { return "SI Lower i1 Copies"; }
877
884};
885
893
895 false, false)
900
901char SILowerI1CopiesLegacy::ID = 0;
902
904
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition MD5.cpp:57
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use)
static Register insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
static bool runFixI1Copies(MachineFunction &MF, MachineDominatorTree &MDT, MachinePostDominatorTree &MPDT)
Lower all instructions that def or use vreg_1 registers.
static bool isVRegCompatibleReg(const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI, Register Reg)
Interface definition of the PhiLoweringHelper class that implements lane mask merging algorithm for d...
#define LLVM_DEBUG(...)
Definition Debug.h:114
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
StringRef getPassName() const override
getPassName - Return a nice clean name for a pass.
bool runOnMachineFunction(MachineFunction &MF) override
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
A debug info location.
Definition DebugLoc.h:123
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:178
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
iterator end()
Definition DenseMap.h:81
Implements a dense probed hash-table based set.
Definition DenseSet.h:279
DomTreeNodeBase * getIDom() const
NodeT * getBlock() const
NodeT * findNearestCommonDominator(NodeT *A, NodeT *B) const
Find nearest common dominator basic block for basic block A and B.
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
const HexagonRegisterInfo & getRegisterInfo() const
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
MachineInstrBundleIterator< MachineInstr > iterator
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
const MachineFunctionProperties & getProperties() const
Get the function properties.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
Register getReg() const
getReg - Returns the register number.
MachinePostDominatorTree - an analysis pass wrapper for DominatorTree used to compute the post-domina...
LLVM_ABI MachineBasicBlock * findNearestCommonDominator(ArrayRef< MachineBasicBlock * > Blocks) const
Returns the nearest common dominator of the given blocks.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
iterator_range< use_instr_iterator > use_instructions(Register Reg) const
bool use_empty(Register RegNo) const
use_empty - Return true if there are no instructions using the specified register.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
LLVM_ABI MachineInstr * getUniqueVRegDef(Register Reg) const
getUniqueVRegDef - Return the unique machine instr that defines the specified virtual register or nul...
MachineSSAUpdater - This class updates SSA form for a set of virtual registers defined in multiple bl...
void AddAvailableValue(MachineBasicBlock *BB, Register V)
AddAvailableValue - Indicate that a rewritten value is available at the end of the specified block wi...
iterator find(const KeyT &Key)
Definition MapVector.h:154
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition MapVector.h:116
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, MachinePostDominatorTree *PDT)
bool isLaneMaskReg(Register Reg) const
MachineRegisterInfo * MRI
MachineDominatorTree * DT
DenseSet< Register > PhiRegisters
virtual void getCandidatesForLowering(SmallVectorImpl< MachineInstr * > &Vreg1Phis) const =0
virtual void constrainAsLaneMask(Incoming &In)=0
virtual void collectIncomingValuesFromPhi(const MachineInstr *MI, SmallVectorImpl< Incoming > &Incomings) const =0
virtual void markAsLaneMask(Register DstReg) const =0
MachinePostDominatorTree * PDT
const GCNSubtarget * ST
const SIInstrInfo * TII
const AMDGPU::LaneMaskConstants * LMC
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs
MachineBasicBlock::iterator getSaluInsertionAtEnd(MachineBasicBlock &MBB) const
Return a point at the end of the given MBB to insert SALU instructions for lane mask calculation.
void initializeLaneMaskRegisterAttributes(Register LaneMask)
bool isConstantLaneMask(Register Reg, bool &Val) const
virtual void buildMergeLaneMasks(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg)=0
virtual void replaceDstReg(Register NewReg, Register OldReg, MachineBasicBlock *MBB)=0
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
Helper class for SSA formation on a set of values defined in multiple blocks.
Definition SSAUpdater.h:39
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
Register createLaneMaskReg(MachineRegisterInfo *MRI, MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs)
DomTreeNodeBase< MachineBasicBlock > MachineDomTreeNode
ArrayRef(const T &OneElt) -> ArrayRef< T >
FunctionPass * createSILowerI1CopiesLegacyPass()
char & SILowerI1CopiesLegacyID
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Incoming for lane mask phi as machine instruction; the incoming register Reg and incoming block Block are taken from the machine instruction.
MachineBasicBlock * Block
All attributes(register class or bank and low-level type) a virtual register can have.