Bug Summary

File: build/source/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Warning: line 569, column 9
Called C++ object pointer is uninitialized
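
The defect pattern behind this report: a local pointer is declared without an initializer, assigned only inside a loop when a matching element is found, and then dereferenced unconditionally. A minimal stand-alone sketch of the same pattern, using hypothetical names rather than the code below:

#include <vector>

struct Group {
  int ID;
  bool isFull() const { return false; }
};

bool firstMatchIsFull(std::vector<Group> &Pipeline, int WantedID) {
  Group *Match;                  // declared without an initial value
  for (Group &G : Pipeline)
    if (G.ID == WantedID)
      Match = &G;                // assigned only when a match exists
  return Match->isFull();        // uninitialized read if no element matched
}

If Pipeline contains no element with WantedID, Match is never written and the final call dereferences an indeterminate pointer. That is the situation the analyzer constructs for the declaration at line 563, the loop over SyncPipeline at line 564, and the call at line 569.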

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUIGroupLP.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/source/llvm/lib/Target/AMDGPU -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations, such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIInstrInfo.h"
22#include "SIMachineFunctionInfo.h"
23#include "llvm/ADT/BitmaskEnum.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/TargetOpcodes.h"
27
28using namespace llvm;
29
30#define DEBUG_TYPE "igrouplp"
31
32namespace {
33
34static cl::opt<bool> EnableExactSolver(
35 "amdgpu-igrouplp-exact-solver", cl::Hidden,
36 cl::desc("Whether to use the exponential time solver to fit "
37 "the instructions to the pipeline as closely as "
38 "possible."),
39 cl::init(false));
40
41static cl::opt<unsigned> CutoffForExact(
42 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43 cl::desc("The maximum number of scheduling group conflicts "
44 "which we attempt to solve with the exponential time "
45 "exact solver. Problem sizes greater than this will"
46 "be solved by the less accurate greedy algorithm. Selecting "
47 "solver by size is superseded by manually selecting "
48 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
49
50static cl::opt<uint64_t> MaxBranchesExplored(
51 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52 cl::desc("The amount of branches that we are willing to explore with"
53 "the exact algorithm before giving up."));
54
55static cl::opt<bool> UseCostHeur(
56 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57 cl::desc("Whether to use the cost heuristic to make choices as we "
58 "traverse the search space using the exact solver. Defaulted "
59 "to on, and if turned off, we will use the node order -- "
60 "attempting to put the later nodes in the later sched groups. "
61 "Experimentally, results are mixed, so this should be set on a "
62 "case-by-case basis."));
63
64// Components of the mask that determines which instruction types may be
65// classified into a SchedGroup.
66enum class SchedGroupMask {
67 NONE = 0u,
68 ALU = 1u << 0,
69 VALU = 1u << 1,
70 SALU = 1u << 2,
71 MFMA = 1u << 3,
72 VMEM = 1u << 4,
73 VMEM_READ = 1u << 5,
74 VMEM_WRITE = 1u << 6,
75 DS = 1u << 7,
76 DS_READ = 1u << 8,
77 DS_WRITE = 1u << 9,
78 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79 DS_READ | DS_WRITE,
80 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81};
82
83typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
84
85// Classify instructions into groups to enable fine tuned control over the
86// scheduler. These groups may be more specific than current SchedModel
87// instruction classes.
88class SchedGroup {
89private:
90 // Mask that defines which instruction types can be classified into this
91 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
92 // and SCHED_GROUP_BARRIER.
93 SchedGroupMask SGMask;
94
95 // Maximum number of SUnits that can be added to this group.
96 std::optional<unsigned> MaxSize;
97
98 // SchedGroups will only synchronize with other SchedGroups that have the same
99 // SyncID.
100 int SyncID = 0;
101
102 // SGID is used to map instructions to candidate SchedGroups
103 unsigned SGID;
104
105 // Count of the number of created SchedGroups, used to initialize SGID.
106 static unsigned NumSchedGroups;
107
108 ScheduleDAGInstrs *DAG;
109
110 const SIInstrInfo *TII;
111
112 // Try to add an edge from SU A to SU B.
113 bool tryAddEdge(SUnit *A, SUnit *B);
114
115 // Use SGMask to determine whether we can classify MI as a member of this
116 // SchedGroup object.
117 bool canAddMI(const MachineInstr &MI) const;
118
119public:
120 // Collection of SUnits that are classified as members of this group.
121 SmallVector<SUnit *, 32> Collection;
122
123 // Returns true if SU can be added to this SchedGroup.
124 bool canAddSU(SUnit &SU) const;
125
126 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
127 // MakePred is true, SU will be a predecessor of the SUnits in this
128 // SchedGroup, otherwise SU will be a successor.
129 void link(SUnit &SU, bool MakePred = false);
130
131 // Add DAG dependencies and track which edges are added, and the count of
132 // missed edges
133 int link(SUnit &SU, bool MakePred,
134 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
135
136 // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
137 // Use the predicate to determine whether SU should be a predecessor (P =
138 // true) or a successor (P = false) of this SchedGroup.
139 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
140
141 // Add DAG dependencies such that SUnits in this group shall be ordered
142 // before SUnits in OtherGroup.
143 void link(SchedGroup &OtherGroup);
144
145 // Returns true if no more instructions may be added to this group.
146 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
147
148 // Add SU to the SchedGroup.
149 void add(SUnit &SU) {
150 LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
151 << format_hex((int)SGMask, 10, true) << " adding "
152 << *SU.getInstr());
153 Collection.push_back(&SU);
154 }
155
156 // Remove last element in the SchedGroup
157 void pop() { Collection.pop_back(); }
158
159 // Identify and add all relevant SUs from the DAG to this SchedGroup.
160 void initSchedGroup();
161
162 // Add instructions to the SchedGroup bottom up starting from RIter.
163 // PipelineInstrs is a set of instructions that should not be added to the
164 // SchedGroup even when the other conditions for adding it are satisfied.
165 // RIter will be added to the SchedGroup as well, and dependencies will be
166 // added so that RIter will always be scheduled at the end of the group.
167 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
168 SUnitsToCandidateSGsMap &SyncedInstrs);
169
170 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
171
172 int getSyncID() { return SyncID; }
173
174 int getSGID() { return SGID; }
175
176 SchedGroupMask getMask() { return SGMask; }
177
178 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
179 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
180 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
181 SGID = NumSchedGroups++;
182 }
183
184 SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
185 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
186 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
187 SGID = NumSchedGroups++;
188 }
189};
190
191// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
192static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
193 assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
194 SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
195 SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
196
197 while (!SU.Preds.empty())
198 for (auto &P : SU.Preds)
199 SU.removePred(P);
200
201 while (!SU.Succs.empty())
202 for (auto &S : SU.Succs)
203 for (auto &SP : S.getSUnit()->Preds)
204 if (SP.getSUnit() == &SU)
205 S.getSUnit()->removePred(SP);
206}
207
208typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
209typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
210
211// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
212// in non-trivial cases. For example, if the requested pipeline is
213// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
214// in the DAG, then we will have an instruction that can not be trivially
215// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
216// to find a good solution to the pipeline -- a greedy algorithm and an exact
217// algorithm. The exact algorithm has an exponential time complexity and should
218// only be used for small sized problems or medium sized problems where an exact
219// solution is highly desired.
220class PipelineSolver {
221 ScheduleDAGMI *DAG;
222
223 // Instructions that can be assigned to multiple SchedGroups
224 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
225 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
226 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
227 // The current working pipeline
228 SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
229 // The pipeline that has the best solution found so far
230 SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
231
232 // Whether or not we actually have any SyncedInstrs to try to solve.
233 bool NeedsSolver = false;
234
235 // Compute an estimate of the size of search tree -- the true size is
236 // the product of each conflictedInst.Matches.size() across all SyncPipelines
237 unsigned computeProblemSize();
238
239 // The cost penalty of not assigning a SU to a SchedGroup
240 int MissPenalty = 0;
241
242 // Costs in terms of the number of edges we are unable to add
243 int BestCost = -1;
244 int CurrCost = 0;
245
246 // Index pointing to the conflicting instruction that is currently being
247 // fitted
248 int CurrConflInstNo = 0;
249 // Index to the pipeline that is currently being fitted
250 int CurrSyncGroupIdx = 0;
251 // The first non trivial pipeline
252 int BeginSyncGroupIdx = 0;
253
254 // How many branches we have explored
255 uint64_t BranchesExplored = 0;
256
257 // Update indices to fit next conflicting instruction
258 void advancePosition();
259 // Recede indices to attempt to find better fit for previous conflicting
260 // instruction
261 void retreatPosition();
262
263 // The exponential time algorithm which finds the provably best fit
264 bool solveExact();
265 // The polynomial time algorithm which attempts to find a good fit
266 bool solveGreedy();
267 // Whether or not the current solution is optimal
268 bool checkOptimal();
269 // Populate the ready list, prioritizing fewest missed edges first
270 void populateReadyList(SUToCandSGsPair &CurrSU,
271 SmallVectorImpl<std::pair<int, int>> &ReadyList,
272 SmallVectorImpl<SchedGroup> &SyncPipeline);
273 // Add edges corresponding to the SchedGroups as assigned by solver
274 void makePipeline();
275 // Add the edges from the SU to the other SchedGroups in pipeline, and
276 // return the number of edges missed.
277 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
278 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
279 // Remove the edges passed via AddedEdges
280 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
281 // Convert the passed in maps to arrays for bidirectional iterators
282 void convertSyncMapsToArrays();
283
284 void reset();
285
286public:
287 // Invoke the solver to map instructions to instruction groups. A heuristic
288 // and a command-line option determine whether to use the exact or greedy algorithm.
289 void solve();
290
291 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
292 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
293 ScheduleDAGMI *DAG)
294 : DAG(DAG), SyncedInstrs(SyncedInstrs),
295 SyncedSchedGroups(SyncedSchedGroups) {
296
297 for (auto &PipelineInstrs : SyncedInstrs) {
298 if (PipelineInstrs.second.size() > 0) {
299 NeedsSolver = true;
300 break;
301 }
302 }
303
304 if (!NeedsSolver)
305 return;
306
307 convertSyncMapsToArrays();
308
309 CurrPipeline = BestPipeline;
310
311 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
312 PipelineInstrs[BeginSyncGroupIdx].size() == 0)
313 ++BeginSyncGroupIdx;
314
315 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
316 return;
317 }
318};
319
320void PipelineSolver::reset() {
321
322 for (auto &SyncPipeline : CurrPipeline) {
323 for (auto &SG : SyncPipeline) {
324 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
325 SG.Collection.clear();
326 auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
327 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
328 });
329 if (SchedBarr != TempCollection.end())
330 SG.Collection.push_back(*SchedBarr);
331 }
332 }
333
334 CurrSyncGroupIdx = BeginSyncGroupIdx;
335 CurrConflInstNo = 0;
336 CurrCost = 0;
337}
338
339void PipelineSolver::convertSyncMapsToArrays() {
340 for (auto &SyncPipe : SyncedSchedGroups) {
341 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
342 }
343
344 int PipelineIDx = SyncedInstrs.size() - 1;
345 PipelineInstrs.resize(SyncedInstrs.size());
346 for (auto &SyncInstrMap : SyncedInstrs) {
347 for (auto &SUsToCandSGs : SyncInstrMap.second) {
348 if (PipelineInstrs[PipelineIDx].size() == 0) {
349 PipelineInstrs[PipelineIDx].push_back(
350 std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
351 continue;
352 }
353 auto SortPosition = PipelineInstrs[PipelineIDx].begin();
354 // Insert them in sorted order -- this allows for good parsing order in
355 // the greedy algorithm
356 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
357 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
358 ++SortPosition;
359 PipelineInstrs[PipelineIDx].insert(
360 SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second));
361 }
362 --PipelineIDx;
363 }
364}
365
366void PipelineSolver::makePipeline() {
367 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
368 for (auto &SyncPipeline : BestPipeline) {
369 for (auto &SG : SyncPipeline) {
370 SUnit *SGBarr = nullptr;
371 for (auto &SU : SG.Collection) {
372 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
373 SGBarr = SU;
374 }
375 // Command line requested IGroupLP doesn't have SGBarr
376 if (!SGBarr)
377 continue;
378 resetEdges(*SGBarr, DAG);
379 SG.link(*SGBarr, false);
380 }
381 }
382
383 for (auto &SyncPipeline : BestPipeline) {
384 auto I = SyncPipeline.rbegin();
385 auto E = SyncPipeline.rend();
386 for (; I != E; ++I) {
387 auto &GroupA = *I;
388 for (auto J = std::next(I); J != E; ++J) {
389 auto &GroupB = *J;
390 GroupA.link(GroupB);
391 }
392 }
393 }
394}
395
396int PipelineSolver::addEdges(
397 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
398 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
399 int AddedCost = 0;
400 bool MakePred = false;
401
402 // The groups in the pipeline are in reverse order. Thus,
403 // by traversing them from last to first, we are traversing
404 // them in the order in which they were introduced in the code. After we
405 // pass the group the SU is being assigned to, it should be
406 // linked as a predecessor of the subsequent SchedGroups
407 auto GroupNo = (int)SyncPipeline.size() - 1;
408 for (; GroupNo >= 0; GroupNo--) {
409 if (SyncPipeline[GroupNo].getSGID() == SGID) {
410 MakePred = true;
411 continue;
412 }
413 auto Group = &SyncPipeline[GroupNo];
414 AddedCost += Group->link(*SU, MakePred, AddedEdges);
415 assert(AddedCost >= 0);
416 }
417
418 return AddedCost;
419}
420
421void PipelineSolver::removeEdges(
422 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
423 // Only remove the edges that we have added when testing
424 // the fit.
425 for (auto &PredSuccPair : EdgesToRemove) {
426 SUnit *Pred = PredSuccPair.first;
427 SUnit *Succ = PredSuccPair.second;
428
429 auto Match = llvm::find_if(
430 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
431 if (Match != Succ->Preds.end()) {
432 assert(Match->isArtificial());
433 Succ->removePred(*Match);
434 }
435 }
436}
437
438void PipelineSolver::advancePosition() {
439 ++CurrConflInstNo;
440
441 if (static_cast<size_t>(CurrConflInstNo) >=
442 PipelineInstrs[CurrSyncGroupIdx].size()) {
443 CurrConflInstNo = 0;
444 ++CurrSyncGroupIdx;
445 // Advance to next non-trivial pipeline
446 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
447 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
448 ++CurrSyncGroupIdx;
449 }
450}
451
452void PipelineSolver::retreatPosition() {
453 assert(CurrConflInstNo >= 0);
454 assert(CurrSyncGroupIdx >= 0);
455
456 if (CurrConflInstNo > 0) {
457 --CurrConflInstNo;
458 return;
459 }
460
461 if (CurrConflInstNo == 0) {
462 // If we return to the starting position, we have explored
463 // the entire tree
464 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
465 return;
466
467 --CurrSyncGroupIdx;
468 // Go to previous non-trivial pipeline
469 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
470 --CurrSyncGroupIdx;
471
472 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
473 }
474}
475
476bool PipelineSolver::checkOptimal() {
477 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
478 if (BestCost == -1 || CurrCost < BestCost) {
479 BestPipeline = CurrPipeline;
480 BestCost = CurrCost;
481 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Found Fit with cost " <<
BestCost << "\n"; } } while (false)
;
482 }
483 assert(BestCost >= 0);
484 }
485
486 bool DoneExploring = false;
487 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
488 DoneExploring = true;
489
490 return (DoneExploring || BestCost == 0);
491}
492
493void PipelineSolver::populateReadyList(
494 SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
495 SmallVectorImpl<SchedGroup> &SyncPipeline) {
496 assert(CurrSU.second.size() >= 1);
497 auto I = CurrSU.second.rbegin();
498 auto E = CurrSU.second.rend();
499 for (; I != E; ++I) {
500 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
501 int CandSGID = *I;
502 SchedGroup *Match;
503 for (auto &SG : SyncPipeline) {
504 if (SG.getSGID() == CandSGID)
505 Match = &SG;
506 }
507
508 if (UseCostHeur) {
509 if (Match->isFull()) {
510 ReadyList.push_back(std::pair(*I, MissPenalty));
511 continue;
512 }
513
514 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
515 ReadyList.push_back(std::pair(*I, TempCost));
516 removeEdges(AddedEdges);
517 } else
518 ReadyList.push_back(std::pair(*I, -1));
519 }
520
521 if (UseCostHeur) {
522 std::sort(ReadyList.begin(), ReadyList.end(),
523 [](std::pair<int, int> A, std::pair<int, int> B) {
524 return A.second < B.second;
525 });
526 }
527
528 assert(ReadyList.size() == CurrSU.second.size());
529}
530
531bool PipelineSolver::solveExact() {
532 if (checkOptimal())
24
Taking false branch
533 return true;
534
535 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
25
Taking false branch
536 return false;
537
538 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size());
26
Assuming the condition is true
27
'?' condition is true
539 assert(static_cast<size_t>(CurrConflInstNo) <
28
Assuming the condition is true
29
'?' condition is true
540 PipelineInstrs[CurrSyncGroupIdx].size());
541 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
542 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
30
Assuming 'DebugFlag' is false
31
Loop condition is false. Exiting loop
543 << ") in Pipeline # " << CurrSyncGroupIdx << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU
.first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx
<< "\n"; } } while (false)
;
544
545 // SchedGroup -> Cost pairs
546 SmallVector<std::pair<int, int>, 4> ReadyList;
547 // Prioritize the candidate sched groups in terms of lowest cost first
548 populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
549
550 auto I = ReadyList.begin();
551 auto E = ReadyList.end();
552 for (; I != E; ++I) {
32
Assuming 'I' is not equal to 'E'
553 // If we are trying SGs in least cost order, and the current SG is cost
554 // infeasible, then all subsequent SGs will also be cost infeasible, so we
555 // can prune.
556 if (BestCost != -1 && (CurrCost + I->second > BestCost))
33
Assuming the condition is false
557 return false;
558
559 int CandSGID = I->first;
560 int AddedCost = 0;
561 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
562 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
563 SchedGroup *Match;
34
'Match' declared without an initial value
564 for (auto &SG : SyncPipeline) {
35
Assuming '__begin2' is equal to '__end2'
565 if (SG.getSGID() == CandSGID)
566 Match = &SG;
567 }
568
569 if (Match->isFull())
36
Called C++ object pointer is uninitialized
570 continue;
571
572 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
573 << (int)Match->getMask() << "and ID " << CandSGID
574 << "\n");
575 Match->add(*CurrSU.first);
576 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
577 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n");
578 CurrCost += AddedCost;
579 advancePosition();
580 ++BranchesExplored;
581 bool FinishedExploring = false;
582 // If the Cost after adding edges is greater than a known solution,
583 // backtrack
584 if (CurrCost < BestCost || BestCost == -1) {
585 if (solveExact()) {
586 FinishedExploring = BestCost != 0;
587 if (!FinishedExploring)
588 return true;
589 }
590 }
591
592 retreatPosition();
593 CurrCost -= AddedCost;
594 removeEdges(AddedEdges);
595 Match->pop();
596 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
597 if (FinishedExploring)
598 return true;
599 }
600
601 // Try the pipeline where the current instruction is omitted
602 // Potentially if we omit a problematic instruction from the pipeline,
603 // all the other instructions can nicely fit.
604 CurrCost += MissPenalty;
605 advancePosition();
606
607 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n");
608
609 bool FinishedExploring = false;
610 if (CurrCost < BestCost || BestCost == -1) {
611 if (solveExact()) {
612 bool FinishedExploring = BestCost != 0;
613 if (!FinishedExploring)
614 return true;
615 }
616 }
617
618 retreatPosition();
619 CurrCost -= MissPenalty;
620 return FinishedExploring;
621}
622
623bool PipelineSolver::solveGreedy() {
624 BestCost = 0;
625 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
626
627 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
628 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
629 int BestNodeCost = -1;
630 int TempCost;
631 SchedGroup *BestGroup = nullptr;
632 int BestGroupID = -1;
633 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
634 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
635 << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
636
637 // Since we have added the potential SchedGroups from bottom up, but
638 // traversed the DAG from top down, parse over the groups from last to
639 // first. If we fail to do this for the greedy algorithm, the solution will
640 // likely not be good in more complex cases.
641 auto I = CurrSU.second.rbegin();
642 auto E = CurrSU.second.rend();
643 for (; I != E; ++I) {
644 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
645 int CandSGID = *I;
646 SchedGroup *Match;
647 for (auto &SG : SyncPipeline) {
648 if (SG.getSGID() == CandSGID)
649 Match = &SG;
650 }
651
652 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
653 << (int)Match->getMask() << "\n");
654
655 if (Match->isFull()) {
656 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
657 continue;
658 }
659 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
660 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
661 if (TempCost < BestNodeCost || BestNodeCost == -1) {
662 BestGroup = Match;
663 BestNodeCost = TempCost;
664 BestGroupID = CandSGID;
665 }
666 removeEdges(AddedEdges);
667 if (BestNodeCost == 0)
668 break;
669 }
670
671 if (BestGroupID != -1) {
672 BestGroup->add(*CurrSU.first);
673 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
674 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
675 << (int)BestGroup->getMask() << "\n");
676 BestCost += TempCost;
677 } else
678 BestCost += MissPenalty;
679
680 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
681 advancePosition();
682 }
683 BestPipeline = CurrPipeline;
684 removeEdges(AddedEdges);
685 return false;
686}
687
688unsigned PipelineSolver::computeProblemSize() {
689 unsigned ProblemSize = 0;
690 for (auto &PipeConflicts : PipelineInstrs) {
691 ProblemSize += PipeConflicts.size();
692 }
693
694 return ProblemSize;
695}
696
697void PipelineSolver::solve() {
698 if (!NeedsSolver)
10
Assuming field 'NeedsSolver' is true
11
Taking false branch
699 return;
700
701 unsigned ProblemSize = computeProblemSize();
702 assert(ProblemSize > 0);
12
Assuming 'ProblemSize' is > 0
13
'?' condition is true
703
704 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
14
Assuming the condition is false
705 MissPenalty = (ProblemSize / 2) + 1;
706
707 LLVM_DEBUG(DAG->dump());
15
Assuming 'DebugFlag' is false
708 if (EnableExactSolver || BelowCutoff) {
16
Assuming the condition is true
709 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n");
17
Loop condition is false. Exiting loop
710 solveGreedy();
711 reset();
712 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n");
18
Assuming 'DebugFlag' is false
19
Loop condition is false. Exiting loop
713 if (BestCost > 0) {
20
Assuming field 'BestCost' is > 0
714 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n");
21
Taking true branch
22
Loop condition is false. Exiting loop
715 solveExact();
23
Calling 'PipelineSolver::solveExact'
716 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n");
717 }
718 } else { // Use the Greedy Algorithm by default
719 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n");
720 solveGreedy();
721 }
722
723 makePipeline();
724}
725
726enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
727
728// Implement an IGLP scheduling strategy.
729class IGLPStrategy {
730protected:
731 ScheduleDAGInstrs *DAG;
732
733 const SIInstrInfo *TII;
734
735public:
736 // Add SchedGroups to \p Pipeline to implement this Strategy.
737 virtual void applyIGLPStrategy(
738 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
739 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
740
741 // Returns true if this strategy should be applied to a ScheduleDAG.
742 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
743
744 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
745 : DAG(DAG), TII(TII) {}
746
747 virtual ~IGLPStrategy() = default;
748};
749
750class MFMASmallGemmOpt final : public IGLPStrategy {
751public:
752 void applyIGLPStrategy(
753 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
754 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
755
756 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
757
758 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
759 : IGLPStrategy(DAG, TII) {}
760};
761
762void MFMASmallGemmOpt::applyIGLPStrategy(
763 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
764 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
765 // Count the number of MFMA instructions.
766 unsigned MFMACount = 0;
767 for (const MachineInstr &I : *DAG)
768 if (TII->isMFMAorWMMA(I))
769 ++MFMACount;
770
771 const unsigned PipelineSyncID = 0;
772 SchedGroup *SG = nullptr;
773 for (unsigned I = 0; I < MFMACount * 3; ++I) {
774 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
775 SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII);
776 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
777
778 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
779 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
780 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
781 }
782}
783
784static std::unique_ptr<IGLPStrategy>
785createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
786 const SIInstrInfo *TII) {
787 switch (ID) {
788 case MFMASmallGemmOptID:
789 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
790 }
791
792 llvm_unreachable("Unknown IGLPStrategyID");
793}
794
795class IGroupLPDAGMutation : public ScheduleDAGMutation {
796private:
797 const SIInstrInfo *TII;
798
799 ScheduleDAGMI *DAG;
800
801 // Organize lists of SchedGroups by their SyncID. SchedGroups /
802 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
803 // between them.
804 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
805
806 // Used to track instructions that can be mapped to multiple sched groups
807 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
808
809 // Add DAG edges that enforce SCHED_BARRIER ordering.
810 void addSchedBarrierEdges(SUnit &SU);
811
812 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
813 // not be reordered across the SCHED_BARRIER. This is used for the base
814 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
815 // SCHED_BARRIER will always block all instructions that can be classified
816 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
817 // and may only synchronize with some SchedGroups. Returns the inverse of
818 // Mask. SCHED_BARRIER's mask describes which instruction types should be
819 // allowed to be scheduled across it. Invert the mask to get the
820 // SchedGroupMask of instructions that should be barred.
821 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
822
823 // Create SchedGroups for a SCHED_GROUP_BARRIER.
824 void initSchedGroupBarrierPipelineStage(
825 std::vector<SUnit>::reverse_iterator RIter);
826
827 void initIGLPOpt(SUnit &SU);
828
829public:
830 void apply(ScheduleDAGInstrs *DAGInstrs) override;
831
832 IGroupLPDAGMutation() = default;
833};
834
835unsigned SchedGroup::NumSchedGroups = 0;
836
837bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
838 if (A != B && DAG->canAddEdge(B, A)) {
839 DAG->addEdge(B, SDep(A, SDep::Artificial));
840 return true;
841 }
842 return false;
843}
844
845bool SchedGroup::canAddMI(const MachineInstr &MI) const {
846 bool Result = false;
847 if (MI.isMetaInstruction())
848 Result = false;
849
850 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
851 (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
852 Result = true;
853
854 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
855 TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
856 Result = true;
857
858 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
859 TII->isSALU(MI))
860 Result = true;
861
862 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
863 TII->isMFMAorWMMA(MI))
864 Result = true;
865
866 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
867 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
868 Result = true;
869
870 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
871 MI.mayLoad() &&
872 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
873 Result = true;
874
875 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
876 MI.mayStore() &&
877 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
878 Result = true;
879
880 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
881 TII->isDS(MI))
882 Result = true;
883
884 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
885 MI.mayLoad() && TII->isDS(MI))
886 Result = true;
887
888 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
889 MI.mayStore() && TII->isDS(MI))
890 Result = true;
891
892 LLVM_DEBUG(
893 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
894 << (Result ? " could classify " : " unable to classify ") << MI);
895
896 return Result;
897}
898
899int SchedGroup::link(SUnit &SU, bool MakePred,
900 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
901 int MissedEdges = 0;
902 for (auto *A : Collection) {
903 SUnit *B = &SU;
904 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
905 continue;
906 if (MakePred)
907 std::swap(A, B);
908
909 if (DAG->IsReachable(B, A))
910 continue;
911 // tryAddEdge returns false if there is a dependency that makes adding
912 // the A->B edge impossible, otherwise it returns true;
913 bool Added = tryAddEdge(A, B);
914 if (Added)
915 AddedEdges.push_back(std::pair(A, B));
916 else
917 ++MissedEdges;
918 }
919
920 return MissedEdges;
921}
922
923void SchedGroup::link(SUnit &SU, bool MakePred) {
924 for (auto *A : Collection) {
925 SUnit *B = &SU;
926 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
927 continue;
928 if (MakePred)
929 std::swap(A, B);
930
931 tryAddEdge(A, B);
932 }
933}
934
935void SchedGroup::link(SUnit &SU,
936 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
937 for (auto *A : Collection) {
938 SUnit *B = &SU;
939 if (P(A, B))
940 std::swap(A, B);
941
942 tryAddEdge(A, B);
943 }
944}
945
946void SchedGroup::link(SchedGroup &OtherGroup) {
947 for (auto *B : OtherGroup.Collection)
948 link(*B);
949}
950
951bool SchedGroup::canAddSU(SUnit &SU) const {
952 MachineInstr &MI = *SU.getInstr();
953 if (MI.getOpcode() != TargetOpcode::BUNDLE)
954 return canAddMI(MI);
955
956 // Special case for bundled MIs.
957 const MachineBasicBlock *MBB = MI.getParent();
958 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
959 while (E != MBB->end() && E->isBundledWithPred())
960 ++E;
961
962 // Return true if all of the bundled MIs can be added to this group.
963 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
964}
965
966void SchedGroup::initSchedGroup() {
967 for (auto &SU : DAG->SUnits) {
968 if (isFull())
969 break;
970
971 if (canAddSU(SU))
972 add(SU);
973 }
974}
975
976void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
977 SUnitsToCandidateSGsMap &SyncedInstrs) {
978 SUnit &InitSU = *RIter;
979 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
980 auto &SU = *RIter;
981 if (isFull())
982 break;
983
984 if (canAddSU(SU))
985 SyncedInstrs[&SU].push_back(SGID);
986 }
987
988 add(InitSU);
989 assert(MaxSize);
990 (*MaxSize)++;
991}
992
993void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
994 auto I = DAG->SUnits.rbegin();
995 auto E = DAG->SUnits.rend();
996 for (; I != E; ++I) {
997 auto &SU = *I;
998 if (isFull())
999 break;
1000
1001 if (canAddSU(SU))
1002 SyncedInstrs[&SU].push_back(SGID);
1003 }
1004}
1005
1006void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
1007 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1008 if (!TSchedModel || DAGInstrs->SUnits.empty())
1
Assuming 'TSchedModel' is non-null
2
Assuming the condition is false
1009 return;
1010
1011 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n");
3
Taking false branch
4
Assuming 'DebugFlag' is false
5
Loop condition is false. Exiting loop
1012 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1013 TII = ST.getInstrInfo();
1014 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1015 SyncedSchedGroups.clear();
1016 SyncedInstrs.clear();
1017 bool foundSB = false;
1018 bool foundIGLP = false;
1019 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
6
Loop condition is true. Entering loop body
1020 unsigned Opc = R->getInstr()->getOpcode();
1021 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1022 if (Opc == AMDGPU::SCHED_BARRIER) {
7
Assuming 'Opc' is equal to SCHED_BARRIER
8
Taking true branch
1023 addSchedBarrierEdges(*R);
1024 foundSB = true;
1025 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1026 initSchedGroupBarrierPipelineStage(R);
1027 foundSB = true;
1028 } else if (Opc == AMDGPU::IGLP_OPT) {
1029 resetEdges(*R, DAG);
1030 if (!foundSB && !foundIGLP)
1031 initIGLPOpt(*R);
1032 foundIGLP = true;
1033 }
1034 }
1035
1036 if (foundSB
8.1
'foundSB' is true
|| foundIGLP) {
1037 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
1038 // PipelineSolver performs the mutation by adding the edges it
1039 // determined as the best
1040 PS.solve();
9
Calling 'PipelineSolver::solve'
1041 }
1042}
1043
1044void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
1045 MachineInstr &MI = *SchedBarrier.getInstr();
1046 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
1047 // Remove all existing edges from the SCHED_BARRIER that were added due to the
1048 // instruction having side effects.
1049 resetEdges(SchedBarrier, DAG);
1050 auto InvertedMask =
1051 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1052 SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
1053 SG.initSchedGroup();
1054 // Preserve original instruction ordering relative to the SCHED_BARRIER.
1055 SG.link(
1056 SchedBarrier,
1057 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1058 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
1059}
1060
1061SchedGroupMask
1062IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1063 // Invert mask and erase bits for types of instructions that are implied to be
1064 // allowed past the SCHED_BARRIER.
1065 SchedGroupMask InvertedMask = ~Mask;
1066
1067 // ALU implies VALU, SALU, MFMA.
1068 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1069 InvertedMask &=
1070 ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1071 // VALU, SALU, MFMA implies ALU.
1072 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1073 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1074 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1075 InvertedMask &= ~SchedGroupMask::ALU;
1076
1077 // VMEM implies VMEM_READ, VMEM_WRITE.
1078 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1079 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1080 // VMEM_READ, VMEM_WRITE implies VMEM.
1081 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1082 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1083 InvertedMask &= ~SchedGroupMask::VMEM;
1084
1085 // DS implies DS_READ, DS_WRITE.
1086 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1087 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1088 // DS_READ, DS_WRITE implies DS.
1089 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1090 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1091 InvertedMask &= ~SchedGroupMask::DS;
1092
1093 return InvertedMask;
1094}
1095
1096void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1097 std::vector<SUnit>::reverse_iterator RIter) {
1098 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1099 // to the instruction having side effects.
1100 resetEdges(*RIter, DAG);
1101 MachineInstr &SGB = *RIter->getInstr();
1102 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1103 int32_t SGMask = SGB.getOperand(0).getImm();
1104 int32_t Size = SGB.getOperand(1).getImm();
1105 int32_t SyncID = SGB.getOperand(2).getImm();
1106
1107 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1108 Size, SyncID, DAG, TII);
1109
1110 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
1111}
1112
1113void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1114 IGLPStrategyID StrategyID =
1115 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1116 auto S = createIGLPStrategy(StrategyID, DAG, TII);
1117 if (S->shouldApplyStrategy(DAG))
1118 S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1119}
1120
1121} // namespace
1122
1123namespace llvm {
1124
1125std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1126 return std::make_unique<IGroupLPDAGMutation>();
1127}
1128
1129} // end namespace llvm
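
A possible way to address the warning, shown here only as a sketch and not as the upstream fix: make the "no matching SchedGroup" case explicit in PipelineSolver::solveExact (the same lookup pattern appears in populateReadyList and solveGreedy). The helper findCandidateGroup below is hypothetical and not part of the file; the names it uses are taken from the code above.

// Sketch only: candidate lookup with an explicit not-found result.
static SchedGroup *findCandidateGroup(SmallVectorImpl<SchedGroup> &SyncPipeline,
                                      int CandSGID) {
  for (auto &SG : SyncPipeline)
    if (SG.getSGID() == CandSGID)
      return &SG;
  return nullptr; // caller must handle the not-found case
}

// In solveExact, the lines flagged at 563-569 would then read:
SchedGroup *Match = findCandidateGroup(SyncPipeline, CandSGID);
// Every CandSGID is expected to name a group in SyncPipeline; asserting the
// invariant makes it visible to readers and to the analyzer, and the guard
// keeps the loop well-defined even if the invariant is ever violated.
assert(Match && "candidate SGID not found in SyncPipeline");
if (!Match || Match->isFull())
  continue;

Whether an assert, an early continue, or both is appropriate depends on whether a missing group is truly unreachable; the sketch keeps both to make the assumption explicit.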