Bug Summary

File: /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Warning: line 656, column 11
Called C++ object pointer is uninitialized
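
In short, in PipelineSolver::solveGreedy a SchedGroup pointer is declared without an initial value and is only assigned inside a loop over the current SyncPipeline; if no group in that pipeline carries the candidate SGID (or the loop body never runs, as the analyzer assumes at step 25), the Match->isFull() call at line 656 dereferences an uninitialized pointer. A minimal, self-contained sketch of the same pattern (hypothetical types and names, not the LLVM code itself):

#include <vector>

struct Group {
  int ID;
  bool isFull() const { return false; }
};

bool pickGroup(const std::vector<Group> &Pipeline, int CandID) {
  const Group *Match;              // declared without an initial value
  for (const Group &G : Pipeline)  // may match nothing; Pipeline may even be empty
    if (G.ID == CandID)
      Match = &G;
  return Match->isFull();          // reads an uninitialized pointer when nothing matched
}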

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name AMDGPUIGroupLP.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I lib/Target/AMDGPU -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU -I include -I /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/= -ferror-limit 19 -fvisibility=hidden -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o 
/tmp/scan-build-2022-10-03-140002-15933-1 -x c++ /build/llvm-toolchain-snapshot-16~++20221003111214+1fa2019828ca/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
1//===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file This file defines a set of schedule DAG mutations that can be used to
10// override default scheduler behavior to enforce specific scheduling patterns.
11// They should be used in cases where runtime performance considerations, such as
12// inter-wavefront interactions, mean that compile-time heuristics cannot
13// predict the optimal instruction ordering, or in kernels where optimum
14// instruction scheduling is important enough to warrant manual intervention.
15//
16//===----------------------------------------------------------------------===//
17
18#include "AMDGPUIGroupLP.h"
19#include "AMDGPUTargetMachine.h"
20#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
21#include "SIInstrInfo.h"
22#include "SIMachineFunctionInfo.h"
23#include "llvm/ADT/BitmaskEnum.h"
24#include "llvm/ADT/DenseMap.h"
25#include "llvm/CodeGen/MachineScheduler.h"
26#include "llvm/CodeGen/TargetOpcodes.h"
27
28using namespace llvm;
29
30#define DEBUG_TYPE"igrouplp" "igrouplp"
31
32namespace {
33
34static cl::opt<bool> EnableExactSolver(
35 "amdgpu-igrouplp-exact-solver", cl::Hidden,
36 cl::desc("Whether to use the exponential time solver to fit "
37 "the instructions to the pipeline as closely as "
38 "possible."),
39 cl::init(false));
40
41static cl::opt<unsigned> CutoffForExact(
42 "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden,
43 cl::desc("The maximum number of scheduling group conflicts "
44 "which we attempt to solve with the exponential time "
45 "exact solver. Problem sizes greater than this will"
46 "be solved by the less accurate greedy algorithm. Selecting "
47 "solver by size is superseded by manually selecting "
48 "the solver (e.g. by amdgpu-igrouplp-exact-solver"));
49
50static cl::opt<uint64_t> MaxBranchesExplored(
51 "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden,
52 cl::desc("The amount of branches that we are willing to explore with"
53 "the exact algorithm before giving up."));
54
55static cl::opt<bool> UseCostHeur(
56 "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden,
57 cl::desc("Whether to use the cost heuristic to make choices as we "
58 "traverse the search space using the exact solver. Defaulted "
59 "to on, and if turned off, we will use the node order -- "
60 "attempting to put the later nodes in the later sched groups. "
61 "Experimentally, results are mixed, so this should be set on a "
62 "case-by-case basis."));
63
64// Components of the mask that determines which instruction types may be
65// classified into a SchedGroup.
66enum class SchedGroupMask {
67 NONE = 0u,
68 ALU = 1u << 0,
69 VALU = 1u << 1,
70 SALU = 1u << 2,
71 MFMA = 1u << 3,
72 VMEM = 1u << 4,
73 VMEM_READ = 1u << 5,
74 VMEM_WRITE = 1u << 6,
75 DS = 1u << 7,
76 DS_READ = 1u << 8,
77 DS_WRITE = 1u << 9,
78 ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
79 DS_READ | DS_WRITE,
80 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
81};
82
83typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
84
85// Classify instructions into groups to enable fine tuned control over the
86// scheduler. These groups may be more specific than current SchedModel
87// instruction classes.
88class SchedGroup {
89private:
90 // Mask that defines which instruction types can be classified into this
91 // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER
92 // and SCHED_GROUP_BARRIER.
93 SchedGroupMask SGMask;
94
95 // Maximum number of SUnits that can be added to this group.
96 Optional<unsigned> MaxSize;
97
98 // SchedGroups will only synchronize with other SchedGroups that have the same
99 // SyncID.
100 int SyncID = 0;
101
102 // SGID is used to map instructions to candidate SchedGroups
103 unsigned SGID;
104
105 // Count of the number of created SchedGroups, used to initialize SGID.
106 static unsigned NumSchedGroups;
107
108 ScheduleDAGInstrs *DAG;
109
110 const SIInstrInfo *TII;
111
112 // Try to add an edge from SU A to SU B.
113 bool tryAddEdge(SUnit *A, SUnit *B);
114
115 // Use SGMask to determine whether we can classify MI as a member of this
116 // SchedGroup object.
117 bool canAddMI(const MachineInstr &MI) const;
118
119public:
120 // Collection of SUnits that are classified as members of this group.
121 SmallVector<SUnit *, 32> Collection;
122
123 // Returns true if SU can be added to this SchedGroup.
124 bool canAddSU(SUnit &SU) const;
125
126 // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If
127 // MakePred is true, SU will be a predecessor of the SUnits in this
128 // SchedGroup, otherwise SU will be a successor.
129 void link(SUnit &SU, bool MakePred = false);
130
131 // Add DAG dependencies and track which edges are added, and the count of
132 // missed edges
133 int link(SUnit &SU, bool MakePred,
134 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
135
136 // Add DAG dependencies from all SUnits in this SchedGroup and this SU.
137 // Use the predicate to determine whether SU should be a predecessor (P =
138 // true) or a successor (P = false) of this SchedGroup.
139 void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P);
140
141 // Add DAG dependencies such that SUnits in this group shall be ordered
142 // before SUnits in OtherGroup.
143 void link(SchedGroup &OtherGroup);
144
145 // Returns true if no more instructions may be added to this group.
146 bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
147
148 // Add SU to the SchedGroup.
149 void add(SUnit &SU) {
150 LLVM_DEBUG(dbgs() << "For SchedGroup with mask "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "For SchedGroup with mask " <<
format_hex((int)SGMask, 10, true) << " adding " <<
*SU.getInstr(); } } while (false)
151 << format_hex((int)SGMask, 10, true) << " adding "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "For SchedGroup with mask " <<
format_hex((int)SGMask, 10, true) << " adding " <<
*SU.getInstr(); } } while (false)
152 << *SU.getInstr())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "For SchedGroup with mask " <<
format_hex((int)SGMask, 10, true) << " adding " <<
*SU.getInstr(); } } while (false)
;
153 Collection.push_back(&SU);
154 }
155
156 // Remove last element in the SchedGroup
157 void pop() { Collection.pop_back(); }
158
159 // Identify and add all relevant SUs from the DAG to this SchedGroup.
160 void initSchedGroup();
161
162 // Add instructions to the SchedGroup bottom up starting from RIter.
163 // PipelineInstrs is a set of instructions that should not be added to the
164 // SchedGroup even when the other conditions for adding it are satisfied.
165 // RIter will be added to the SchedGroup as well, and dependencies will be
166 // added so that RIter will always be scheduled at the end of the group.
167 void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
168 SUnitsToCandidateSGsMap &SyncedInstrs);
169
170 void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs);
171
172 int getSyncID() { return SyncID; }
173
174 int getSGID() { return SGID; }
175
176 SchedGroupMask getMask() { return SGMask; }
177
178 SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize,
179 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
180 : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
181 SGID = NumSchedGroups++;
182 }
183
184 SchedGroup(SchedGroupMask SGMask, Optional<unsigned> MaxSize, int SyncID,
185 ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
186 : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
187 SGID = NumSchedGroups++;
188 }
189};
190
191// Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER.
192static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
193 assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
194 SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
195 SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
196
197 while (!SU.Preds.empty())
198 for (auto &P : SU.Preds)
199 SU.removePred(P);
200
201 while (!SU.Succs.empty())
202 for (auto &S : SU.Succs)
203 for (auto &SP : S.getSUnit()->Preds)
204 if (SP.getSUnit() == &SU)
205 S.getSUnit()->removePred(SP);
206}
207
208typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair;
209typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec;
210
211// The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline
212// in non-trivial cases. For example, if the requested pipeline is
213// {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction
214// in the DAG, then we will have an instruction that can not be trivially
215// assigned to a SchedGroup. The PipelineSolver class implements two algorithms
216// to find a good solution to the pipeline -- a greedy algorithm and an exact
217// algorithm. The exact algorithm has an exponential time complexity and should
218// only be used for small sized problems or medium sized problems where an exact
219// solution is highly desired.
220class PipelineSolver {
221 ScheduleDAGMI *DAG;
222
223 // Instructions that can be assigned to multiple SchedGroups
224 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
225 SmallVector<SUsToCandSGsVec, 4> PipelineInstrs;
226 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
227 // The current working pipeline
228 SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline;
229 // The pipeline that has the best solution found so far
230 SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline;
231
232 // Whether or not we actually have any SyncedInstrs to try to solve.
233 bool NeedsSolver = false;
234
235 // Compute an estimate of the size of search tree -- the true size is
236 // the product of each conflictedInst.Matches.size() across all SyncPipelines
237 unsigned computeProblemSize();
238
239 // The cost penalty of not assigning a SU to a SchedGroup
240 int MissPenalty = 0;
241
242 // Costs in terms of the number of edges we are unable to add
243 int BestCost = -1;
244 int CurrCost = 0;
245
246 // Index pointing to the conflicting instruction that is currently being
247 // fitted
248 int CurrConflInstNo = 0;
249 // Index to the pipeline that is currently being fitted
250 int CurrSyncGroupIdx = 0;
251 // The first non trivial pipeline
252 int BeginSyncGroupIdx = 0;
253
254 // How many branches we have explored
255 uint64_t BranchesExplored = 0;
256
257 // Update indices to fit next conflicting instruction
258 void advancePosition();
259 // Recede indices to attempt to find better fit for previous conflicting
260 // instruction
261 void retreatPosition();
262
263 // The exponential time algorithm which finds the provably best fit
264 bool solveExact();
265 // The polynomial time algorithm which attempts to find a good fit
266 bool solveGreedy();
267 // Whether or not the current solution is optimal
268 bool checkOptimal();
269 // Populate the ready list, prioritizing fewest missed edges first
270 void populateReadyList(SUToCandSGsPair &CurrSU,
271 SmallVectorImpl<std::pair<int, int>> &ReadyList,
272 SmallVectorImpl<SchedGroup> &SyncPipeline);
273 // Add edges corresponding to the SchedGroups as assigned by solver
274 void makePipeline();
275 // Add the edges from the SU to the other SchedGroups in pipeline, and
276 // return the number of edges missed.
277 int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
278 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
279 // Remove the edges passed via AddedEdges
280 void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
281 // Convert the passed in maps to arrays for bidirectional iterators
282 void convertSyncMapsToArrays();
283
284 void reset();
285
286public:
287 // Invoke the solver to map instructions to instruction groups. The heuristic
288 // and the command-line option determine whether to use the exact or greedy algorithm.
289 void solve();
290
291 PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
292 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
293 ScheduleDAGMI *DAG)
294 : DAG(DAG), SyncedInstrs(SyncedInstrs),
295 SyncedSchedGroups(SyncedSchedGroups) {
296
297 for (auto &PipelineInstrs : SyncedInstrs) {
298 if (PipelineInstrs.second.size() > 0) {
299 NeedsSolver = true;
300 break;
301 }
302 }
303
304 if (!NeedsSolver)
305 return;
306
307 convertSyncMapsToArrays();
308
309 CurrPipeline = BestPipeline;
310
311 while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() &&
312 PipelineInstrs[BeginSyncGroupIdx].size() == 0)
313 ++BeginSyncGroupIdx;
314
315 if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size())
316 return;
317 }
318};
319
320void PipelineSolver::reset() {
321
322 for (auto &SyncPipeline : CurrPipeline) {
323 for (auto &SG : SyncPipeline) {
324 SmallVector<SUnit *, 32> TempCollection = SG.Collection;
325 SG.Collection.clear();
326 auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
327 return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
328 });
329 if (SchedBarr != TempCollection.end())
330 SG.Collection.push_back(*SchedBarr);
331 }
332 }
333
334 CurrSyncGroupIdx = BeginSyncGroupIdx;
335 CurrConflInstNo = 0;
336 CurrCost = 0;
337}
338
339void PipelineSolver::convertSyncMapsToArrays() {
340 for (auto &SyncPipe : SyncedSchedGroups) {
341 BestPipeline.insert(BestPipeline.begin(), SyncPipe.second);
342 }
343
344 int PipelineIDx = SyncedInstrs.size() - 1;
345 PipelineInstrs.resize(SyncedInstrs.size());
346 for (auto &SyncInstrMap : SyncedInstrs) {
347 for (auto &SUsToCandSGs : SyncInstrMap.second) {
348 if (PipelineInstrs[PipelineIDx].size() == 0) {
349 PipelineInstrs[PipelineIDx].push_back(
350 std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
351 continue;
352 }
353 auto SortPosition = PipelineInstrs[PipelineIDx].begin();
354 // Insert them in sorted order -- this allows for good parsing order in
355 // the greedy algorithm
356 while (SortPosition != PipelineInstrs[PipelineIDx].end() &&
357 SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum)
358 ++SortPosition;
359 PipelineInstrs[PipelineIDx].insert(
360 SortPosition,
361 std::make_pair(SUsToCandSGs.first, SUsToCandSGs.second));
362 }
363 --PipelineIDx;
364 }
365}
366
367void PipelineSolver::makePipeline() {
368 // Preserve the order of barrier for subsequent SchedGroupBarrier mutations
369 for (auto &SyncPipeline : BestPipeline) {
370 for (auto &SG : SyncPipeline) {
371 SUnit *SGBarr = nullptr;
372 for (auto &SU : SG.Collection) {
373 if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
374 SGBarr = SU;
375 }
376 // Command line requested IGroupLP doesn't have SGBarr
377 if (!SGBarr)
378 continue;
379 resetEdges(*SGBarr, DAG);
380 SG.link(*SGBarr, false);
381 }
382 }
383
384 for (auto &SyncPipeline : BestPipeline) {
385 auto I = SyncPipeline.rbegin();
386 auto E = SyncPipeline.rend();
387 for (; I != E; ++I) {
388 auto &GroupA = *I;
389 for (auto J = std::next(I); J != E; ++J) {
390 auto &GroupB = *J;
391 GroupA.link(GroupB);
392 }
393 }
394 }
395}
396
397int PipelineSolver::addEdges(
398 SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
399 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
400 int AddedCost = 0;
401 bool MakePred = false;
402
403 // The groups in the pipeline are in reverse order. Thus,
404 // by traversing them from last to first, we are traversing
405 // them in the order as they were introduced in the code. After we
406 // pass the group the SU is being assigned to, it should be
407 // linked as a predecessor of the subsequent SchedGroups
408 auto GroupNo = (int)SyncPipeline.size() - 1;
409 for (; GroupNo >= 0; GroupNo--) {
410 if (SyncPipeline[GroupNo].getSGID() == SGID) {
411 MakePred = true;
412 continue;
413 }
414 auto Group = &SyncPipeline[GroupNo];
415 AddedCost += Group->link(*SU, MakePred, AddedEdges);
416 assert(AddedCost >= 0)(static_cast <bool> (AddedCost >= 0) ? void (0) : __assert_fail
("AddedCost >= 0", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 416, __extension__ __PRETTY_FUNCTION__))
;
417 }
418
419 return AddedCost;
420}
421
422void PipelineSolver::removeEdges(
423 const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
424 // Only remove the edges that we have added when testing
425 // the fit.
426 for (auto &PredSuccPair : EdgesToRemove) {
427 SUnit *Pred = PredSuccPair.first;
428 SUnit *Succ = PredSuccPair.second;
429
430 auto Match = llvm::find_if(
431 Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; });
432 if (Match != Succ->Preds.end()) {
433 assert(Match->isArtificial())(static_cast <bool> (Match->isArtificial()) ? void (
0) : __assert_fail ("Match->isArtificial()", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 433, __extension__ __PRETTY_FUNCTION__))
;
434 Succ->removePred(*Match);
435 }
436 }
437}
438
439void PipelineSolver::advancePosition() {
440 ++CurrConflInstNo;
441
442 if (static_cast<size_t>(CurrConflInstNo) >=
443 PipelineInstrs[CurrSyncGroupIdx].size()) {
444 CurrConflInstNo = 0;
445 ++CurrSyncGroupIdx;
446 // Advance to next non-trivial pipeline
447 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() &&
448 PipelineInstrs[CurrSyncGroupIdx].size() == 0)
449 ++CurrSyncGroupIdx;
450 }
451}
452
453void PipelineSolver::retreatPosition() {
454 assert(CurrConflInstNo >= 0)(static_cast <bool> (CurrConflInstNo >= 0) ? void (0
) : __assert_fail ("CurrConflInstNo >= 0", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 454, __extension__ __PRETTY_FUNCTION__))
;
455 assert(CurrSyncGroupIdx >= 0)(static_cast <bool> (CurrSyncGroupIdx >= 0) ? void (
0) : __assert_fail ("CurrSyncGroupIdx >= 0", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 455, __extension__ __PRETTY_FUNCTION__))
;
456
457 if (CurrConflInstNo > 0) {
458 --CurrConflInstNo;
459 return;
460 }
461
462 if (CurrConflInstNo == 0) {
463 // If we return to the starting position, we have explored
464 // the entire tree
465 if (CurrSyncGroupIdx == BeginSyncGroupIdx)
466 return;
467
468 --CurrSyncGroupIdx;
469 // Go to previous non-trivial pipeline
470 while (PipelineInstrs[CurrSyncGroupIdx].size() == 0)
471 --CurrSyncGroupIdx;
472
473 CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1;
474 }
475}
476
477bool PipelineSolver::checkOptimal() {
478 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) {
479 if (BestCost == -1 || CurrCost < BestCost) {
480 BestPipeline = CurrPipeline;
481 BestCost = CurrCost;
482 LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Found Fit with cost " <<
BestCost << "\n"; } } while (false)
;
483 }
484 assert(BestCost >= 0)(static_cast <bool> (BestCost >= 0) ? void (0) : __assert_fail
("BestCost >= 0", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 484, __extension__ __PRETTY_FUNCTION__))
;
485 }
486
487 bool DoneExploring = false;
488 if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored)
489 DoneExploring = true;
490
491 return (DoneExploring || BestCost == 0);
492}
493
494void PipelineSolver::populateReadyList(
495 SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
496 SmallVectorImpl<SchedGroup> &SyncPipeline) {
497 assert(CurrSU.second.size() >= 1)(static_cast <bool> (CurrSU.second.size() >= 1) ? void
(0) : __assert_fail ("CurrSU.second.size() >= 1", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 497, __extension__ __PRETTY_FUNCTION__))
;
498 auto I = CurrSU.second.rbegin();
499 auto E = CurrSU.second.rend();
500 for (; I != E; ++I) {
501 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
502 int CandSGID = *I;
503 SchedGroup *Match;
504 for (auto &SG : SyncPipeline) {
505 if (SG.getSGID() == CandSGID)
506 Match = &SG;
507 }
508
509 if (UseCostHeur) {
510 if (Match->isFull()) {
511 ReadyList.push_back(std::make_pair(*I, MissPenalty));
512 continue;
513 }
514
515 int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
516 ReadyList.push_back(std::make_pair(*I, TempCost));
517 removeEdges(AddedEdges);
518 } else
519 ReadyList.push_back(std::make_pair(*I, -1));
520 }
521
522 if (UseCostHeur) {
523 std::sort(ReadyList.begin(), ReadyList.end(),
524 [](std::pair<int, int> A, std::pair<int, int> B) {
525 return A.second < B.second;
526 });
527 }
528
529 assert(ReadyList.size() == CurrSU.second.size());
530}
531
532bool PipelineSolver::solveExact() {
533 if (checkOptimal())
534 return true;
535
536 if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size())
537 return false;
538
539 assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size())(static_cast <bool> (static_cast<size_t>(CurrSyncGroupIdx
) < PipelineInstrs.size()) ? void (0) : __assert_fail ("static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()"
, "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp", 539, __extension__
__PRETTY_FUNCTION__))
;
540 assert(static_cast<size_t>(CurrConflInstNo) <(static_cast <bool> (static_cast<size_t>(CurrConflInstNo
) < PipelineInstrs[CurrSyncGroupIdx].size()) ? void (0) : __assert_fail
("static_cast<size_t>(CurrConflInstNo) < PipelineInstrs[CurrSyncGroupIdx].size()"
, "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp", 541, __extension__
__PRETTY_FUNCTION__))
541 PipelineInstrs[CurrSyncGroupIdx].size())(static_cast <bool> (static_cast<size_t>(CurrConflInstNo
) < PipelineInstrs[CurrSyncGroupIdx].size()) ? void (0) : __assert_fail
("static_cast<size_t>(CurrConflInstNo) < PipelineInstrs[CurrSyncGroupIdx].size()"
, "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp", 541, __extension__
__PRETTY_FUNCTION__))
;
542 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
543 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNumdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU
.first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx
<< "\n"; } } while (false)
544 << ") in Pipeline # " << CurrSyncGroupIdx << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU
.first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx
<< "\n"; } } while (false)
;
545
546 // SchedGroup -> Cost pairs
547 SmallVector<std::pair<int, int>, 4> ReadyList;
548 // Prioritize the candidate sched groups in terms of lowest cost first
549 populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
550
551 auto I = ReadyList.begin();
552 auto E = ReadyList.end();
553 for (; I != E; ++I) {
554 // If we are trying SGs in least cost order, and the current SG is cost
555 // infeasible, then all subsequent SGs will also be cost infeasible, so we
556 // can prune.
557 if (BestCost != -1 && (CurrCost + I->second > BestCost))
558 return false;
559
560 int CandSGID = I->first;
561 int AddedCost = 0;
562 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
563 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
564 SchedGroup *Match;
565 for (auto &SG : SyncPipeline) {
566 if (SG.getSGID() == CandSGID)
567 Match = &SG;
568 }
569
570 if (Match->isFull())
571 continue;
572
573 LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " <<
CandSGID << "\n"; } } while (false)
574 << (int)Match->getMask() << "and ID " << CandSGIDdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " <<
CandSGID << "\n"; } } while (false)
575 << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " <<
CandSGID << "\n"; } } while (false)
;
576 Match->add(*CurrSU.first);
577 AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
578 LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Cost of Assignment: " <<
AddedCost << "\n"; } } while (false)
;
579 CurrCost += AddedCost;
580 advancePosition();
581 ++BranchesExplored;
582 bool FinishedExploring = false;
583 // If the Cost after adding edges is greater than a known solution,
584 // backtrack
585 if (CurrCost < BestCost || BestCost == -1) {
586 if (solveExact()) {
587 FinishedExploring = BestCost != 0;
588 if (!FinishedExploring)
589 return true;
590 }
591 }
592
593 retreatPosition();
594 CurrCost -= AddedCost;
595 removeEdges(AddedEdges);
596 Match->pop();
597 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
598 if (FinishedExploring)
599 return true;
600 }
601
602 // Try the pipeline where the current instruction is omitted
603 // Potentially if we omit a problematic instruction from the pipeline,
604 // all the other instructions can nicely fit.
605 CurrCost += MissPenalty;
606 advancePosition();
607
608 LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "NOT Assigned (" << CurrSU
.first->NodeNum << ")\n"; } } while (false)
;
609
610 bool FinishedExploring = false;
611 if (CurrCost < BestCost || BestCost == -1) {
612 if (solveExact()) {
613 bool FinishedExploring = BestCost != 0;
614 if (!FinishedExploring)
615 return true;
616 }
617 }
618
619 retreatPosition();
620 CurrCost -= MissPenalty;
621 return FinishedExploring;
622}
623
624bool PipelineSolver::solveGreedy() {
625 BestCost = 0;
626 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
627
628 while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
19
Assuming the condition is true
20
Loop condition is true. Entering loop body
629 SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
630 int BestNodeCost = -1;
631 int TempCost;
632 SchedGroup *BestGroup = nullptr;
633 int BestGroupID = -1;
634 auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
635 LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNumdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU
.first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx
<< "\n"; } } while (false)
21
Assuming 'DebugFlag' is false
22
Loop condition is false. Exiting loop
636 << ") in Pipeline # " << CurrSyncGroupIdx << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU
.first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx
<< "\n"; } } while (false)
;
637
638 // Since we have added the potential SchedGroups from bottom up, but
639 // traversed the DAG from top down, parse over the groups from last to
640 // first. If we fail to do this for the greedy algorithm, the solution will
641 // likely not be good in more complex cases.
642 auto I = CurrSU.second.rbegin();
643 auto E = CurrSU.second.rend();
644 for (; I != E; ++I) {
23
Loop condition is true. Entering loop body
645 std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
646 int CandSGID = *I;
647 SchedGroup *Match;
24
'Match' declared without an initial value
648 for (auto &SG : SyncPipeline) {
25
Assuming '__begin3' is equal to '__end3'
649 if (SG.getSGID() == CandSGID)
650 Match = &SG;
651 }
652
653 LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Trying SGID # " << CandSGID
<< " with Mask " << (int)Match->getMask() <<
"\n"; } } while (false)
26
Assuming 'DebugFlag' is false
27
Loop condition is false. Exiting loop
654 << (int)Match->getMask() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Trying SGID # " << CandSGID
<< " with Mask " << (int)Match->getMask() <<
"\n"; } } while (false)
;
655
656 if (Match->isFull()) {
28
Called C++ object pointer is uninitialized
657 LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "SGID # " << CandSGID <<
" is full\n"; } } while (false)
;
658 continue;
659 }
660 TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
661 LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Cost of Group " << TempCost
<< "\n"; } } while (false)
;
662 if (TempCost < BestNodeCost || BestNodeCost == -1) {
663 BestGroup = Match;
664 BestNodeCost = TempCost;
665 BestGroupID = CandSGID;
666 }
667 removeEdges(AddedEdges);
668 if (BestNodeCost == 0)
669 break;
670 }
671
672 if (BestGroupID != -1) {
673 BestGroup->add(*CurrSU.first);
674 addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
675 LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Best Group has ID: " <<
BestGroupID << " and Mask" << (int)BestGroup->
getMask() << "\n"; } } while (false)
676 << (int)BestGroup->getMask() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Best Group has ID: " <<
BestGroupID << " and Mask" << (int)BestGroup->
getMask() << "\n"; } } while (false)
;
677 BestCost += TempCost;
678 } else
679 BestCost += MissPenalty;
680
681 CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
682 advancePosition();
683 }
684 BestPipeline = CurrPipeline;
685 removeEdges(AddedEdges);
686 return false;
687}
688
689unsigned PipelineSolver::computeProblemSize() {
690 unsigned ProblemSize = 0;
691 for (auto &PipeConflicts : PipelineInstrs) {
692 ProblemSize += PipeConflicts.size();
693 }
694
695 return ProblemSize;
696}
697
698void PipelineSolver::solve() {
699 if (!NeedsSolver)
10
Assuming field 'NeedsSolver' is true
11
Taking false branch
700 return;
701
702 unsigned ProblemSize = computeProblemSize();
703 assert(ProblemSize > 0)(static_cast <bool> (ProblemSize > 0) ? void (0) : __assert_fail
("ProblemSize > 0", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 703, __extension__ __PRETTY_FUNCTION__))
;
12
Assuming 'ProblemSize' is > 0
13
'?' condition is true
704
705 bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact;
14
Assuming the condition is false
706 MissPenalty = (ProblemSize / 2) + 1;
707
708 LLVM_DEBUG(DAG->dump())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { DAG->dump(); } } while (false)
;
15
Assuming 'DebugFlag' is false
709 if (EnableExactSolver || BelowCutoff) {
16
Assuming the condition is true
710 LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Starting Greedy pipeline solver\n"
; } } while (false)
;
17
Loop condition is false. Exiting loop
711 solveGreedy();
18
Calling 'PipelineSolver::solveGreedy'
712 reset();
713 LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Greedy produced best cost of "
<< BestCost << "\n"; } } while (false)
;
714 if (BestCost > 0) {
715 LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Starting EXACT pipeline solver\n"
; } } while (false)
;
716 solveExact();
717 LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Exact produced best cost of "
<< BestCost << "\n"; } } while (false)
;
718 }
719 } else { // Use the Greedy Algorithm by default
720 LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Starting GREEDY pipeline solver\n"
; } } while (false)
;
721 solveGreedy();
722 }
723
724 makePipeline();
725}
726
727enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
728
729// Implement an IGLP scheduling strategy.
730class IGLPStrategy {
731protected:
732 ScheduleDAGInstrs *DAG;
733
734 const SIInstrInfo *TII;
735
736public:
737 // Add SchedGroups to \p Pipeline to implement this Strategy.
738 virtual void applyIGLPStrategy(
739 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
740 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
741
742 // Returns true if this strategy should be applied to a ScheduleDAG.
743 virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
744
745 IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
746 : DAG(DAG), TII(TII) {}
747
748 virtual ~IGLPStrategy() = default;
749};
750
751class MFMASmallGemmOpt final : public IGLPStrategy {
752public:
753 void applyIGLPStrategy(
754 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
755 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
756
757 bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
758
759 MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
760 : IGLPStrategy(DAG, TII) {}
761};
762
763void MFMASmallGemmOpt::applyIGLPStrategy(
764 DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
765 DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
766 // Count the number of MFMA instructions.
767 unsigned MFMACount = 0;
768 for (const MachineInstr &I : *DAG)
769 if (TII->isMFMA(I))
770 ++MFMACount;
771
772 const unsigned PipelineSyncID = 0;
773 SchedGroup *SG = nullptr;
774 for (unsigned I = 0; I < MFMACount; ++I) {
775 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
776 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
777 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
778
779 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
780 SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
781 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
782
783 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
784 SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
785 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
786
787 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
788 SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
789 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
790
791 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
792 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
793 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
794 }
795
796 for (unsigned I = 0; I < MFMACount; ++I) {
797 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
798 SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
799 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
800
801 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
802 SchedGroupMask::VMEM_READ, 1, PipelineSyncID, DAG, TII);
803 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
804
805 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
806 SchedGroupMask::VMEM_WRITE, 1, PipelineSyncID, DAG, TII);
807 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
808
809 SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
810 SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
811 SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
812 }
813}
814
815static std::unique_ptr<IGLPStrategy>
816createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
817 const SIInstrInfo *TII) {
818 switch (ID) {
819 case MFMASmallGemmOptID:
820 return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
821 }
822
823 llvm_unreachable("Unknown IGLPStrategyID")::llvm::llvm_unreachable_internal("Unknown IGLPStrategyID", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp"
, 823)
;
824}
825
826class IGroupLPDAGMutation : public ScheduleDAGMutation {
827private:
828 const SIInstrInfo *TII;
829
830 ScheduleDAGMI *DAG;
831
832 // Organize lists of SchedGroups by their SyncID. SchedGroups /
833 // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
834 // between them.
835 DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups;
836
837 // Used to track instructions that can be mapped to multiple sched groups
838 DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs;
839
840 // Add DAG edges that enforce SCHED_BARRIER ordering.
841 void addSchedBarrierEdges(SUnit &SU);
842
843 // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should
844 // not be reordered across the SCHED_BARRIER. This is used for the base
845 // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that
846 // SCHED_BARRIER will always block all instructions that can be classified
847 // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size
848 // and may only synchronize with some SchedGroups. Returns the inverse of
849 // Mask. SCHED_BARRIER's mask describes which instruction types should be
850 // allowed to be scheduled across it. Invert the mask to get the
851 // SchedGroupMask of instructions that should be barred.
852 SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const;
853
854 // Create SchedGroups for a SCHED_GROUP_BARRIER.
855 void initSchedGroupBarrierPipelineStage(
856 std::vector<SUnit>::reverse_iterator RIter);
857
858 void initIGLPOpt(SUnit &SU);
859
860public:
861 void apply(ScheduleDAGInstrs *DAGInstrs) override;
862
863 IGroupLPDAGMutation() = default;
864};
865
866unsigned SchedGroup::NumSchedGroups = 0;
867
868bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) {
869 if (A != B && DAG->canAddEdge(B, A)) {
870 DAG->addEdge(B, SDep(A, SDep::Artificial));
871 return true;
872 }
873 return false;
874}
875
876bool SchedGroup::canAddMI(const MachineInstr &MI) const {
877 bool Result = false;
878 if (MI.isMetaInstruction())
879 Result = false;
880
881 else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
882 (TII->isVALU(MI) || TII->isMFMA(MI) || TII->isSALU(MI)))
883 Result = true;
884
885 else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
886 TII->isVALU(MI) && !TII->isMFMA(MI))
887 Result = true;
888
889 else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
890 TII->isSALU(MI))
891 Result = true;
892
893 else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) &&
894 TII->isMFMA(MI))
895 Result = true;
896
897 else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) &&
898 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
899 Result = true;
900
901 else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) &&
902 MI.mayLoad() &&
903 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
904 Result = true;
905
906 else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) &&
907 MI.mayStore() &&
908 (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI))))
909 Result = true;
910
911 else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) &&
912 TII->isDS(MI))
913 Result = true;
914
915 else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) &&
916 MI.mayLoad() && TII->isDS(MI))
917 Result = true;
918
919 else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) &&
920 MI.mayStore() && TII->isDS(MI))
921 Result = true;
922
923 LLVM_DEBUG(
924 dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
925 << (Result ? " could classify " : " unable to classify ") << MI);
926
927 return Result;
928}
929
930int SchedGroup::link(SUnit &SU, bool MakePred,
931 std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
932 int MissedEdges = 0;
933 for (auto *A : Collection) {
934 SUnit *B = &SU;
935 if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
936 continue;
937 if (MakePred)
938 std::swap(A, B);
939
940 if (DAG->IsReachable(B, A))
941 continue;
942 // tryAddEdge returns false if there is a dependency that makes adding
943 // the A->B edge impossible, otherwise it returns true;
944 bool Added = tryAddEdge(A, B);
945 if (Added)
946 AddedEdges.push_back(std::make_pair(A, B));
947 else
948 ++MissedEdges;
949 }
950
951 return MissedEdges;
952}
953
954void SchedGroup::link(SUnit &SU, bool MakePred) {
955 for (auto *A : Collection) {
956 SUnit *B = &SU;
957 if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
958 continue;
959 if (MakePred)
960 std::swap(A, B);
961
962 tryAddEdge(A, B);
963 }
964}
965
966void SchedGroup::link(SUnit &SU,
967 function_ref<bool(const SUnit *A, const SUnit *B)> P) {
968 for (auto *A : Collection) {
969 SUnit *B = &SU;
970 if (P(A, B))
971 std::swap(A, B);
972
973 tryAddEdge(A, B);
974 }
975}
976
977void SchedGroup::link(SchedGroup &OtherGroup) {
978 for (auto *B : OtherGroup.Collection)
979 link(*B);
980}
981
982bool SchedGroup::canAddSU(SUnit &SU) const {
983 MachineInstr &MI = *SU.getInstr();
984 if (MI.getOpcode() != TargetOpcode::BUNDLE)
985 return canAddMI(MI);
986
987 // Special case for bundled MIs.
988 const MachineBasicBlock *MBB = MI.getParent();
989 MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B;
990 while (E != MBB->end() && E->isBundledWithPred())
991 ++E;
992
993 // Return true if all of the bundled MIs can be added to this group.
994 return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); });
995}
996
997void SchedGroup::initSchedGroup() {
998 for (auto &SU : DAG->SUnits) {
999 if (isFull())
1000 break;
1001
1002 if (canAddSU(SU))
1003 add(SU);
1004 }
1005}
1006
1007void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter,
1008 SUnitsToCandidateSGsMap &SyncedInstrs) {
1009 SUnit &InitSU = *RIter;
1010 for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) {
1011 auto &SU = *RIter;
1012 if (isFull())
1013 break;
1014
1015 if (canAddSU(SU))
1016 SyncedInstrs[&SU].push_back(SGID);
1017 }
1018
1019 add(InitSU);
1020 assert(MaxSize);
1021 (*MaxSize)++;
1022}
1023
1024void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) {
1025 auto I = DAG->SUnits.rbegin();
1026 auto E = DAG->SUnits.rend();
1027 for (; I != E; ++I) {
1028 auto &SU = *I;
1029 if (isFull())
1030 break;
1031
1032 if (canAddSU(SU))
1033 SyncedInstrs[&SU].push_back(SGID);
1034 }
1035}
1036
1037void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
1038 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
1039 if (!TSchedModel || DAGInstrs->SUnits.empty())
1
Assuming 'TSchedModel' is non-null
2
Assuming the condition is false
1040 return;
1041
1042 LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType
("igrouplp")) { dbgs() << "Applying IGroupLPDAGMutation...\n"
; } } while (false)
;
3
Taking false branch
4
Assuming 'DebugFlag' is false
5
Loop condition is false. Exiting loop
1043 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
1044 TII = ST.getInstrInfo();
1045 DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1046 SyncedSchedGroups.clear();
1047 SyncedInstrs.clear();
1048 bool foundSB = false;
1049 bool foundIGLP = false;
1050 for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) {
6
Loop condition is true. Entering loop body
1051 unsigned Opc = R->getInstr()->getOpcode();
1052 // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive.
1053 if (Opc == AMDGPU::SCHED_BARRIER) {
7
Assuming 'Opc' is equal to SCHED_BARRIER
8
Taking true branch
1054 addSchedBarrierEdges(*R);
1055 foundSB = true;
1056 } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
1057 initSchedGroupBarrierPipelineStage(R);
1058 foundSB = true;
1059 } else if (Opc == AMDGPU::IGLP_OPT) {
1060 resetEdges(*R, DAG);
1061 if (!foundSB && !foundIGLP)
1062 initIGLPOpt(*R);
1063 foundIGLP = true;
1064 }
1065 }
1066
1067 if (foundSB || foundIGLP) {
8.1
'foundSB' is true
1068 PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
1069 // PipelineSolver performs the mutation by adding the edges it
1070 // determined as the best
1071 PS.solve();
9
Calling 'PipelineSolver::solve'
1072 }
1073}
1074
1075void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
1076 MachineInstr &MI = *SchedBarrier.getInstr();
1077 assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER);
1078 // Remove all existing edges from the SCHED_BARRIER that were added due to the
1079 // instruction having side effects.
1080 resetEdges(SchedBarrier, DAG);
1081 auto InvertedMask =
1082 invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
1083 SchedGroup SG(InvertedMask, None, DAG, TII);
1084 SG.initSchedGroup();
1085 // Preserve original instruction ordering relative to the SCHED_BARRIER.
1086 SG.link(
1087 SchedBarrier,
1088 (function_ref<bool(const SUnit *A, const SUnit *B)>)[](
1089 const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; });
1090}
1091
1092SchedGroupMask
1093IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
1094 // Invert mask and erase bits for types of instructions that are implied to be
1095 // allowed past the SCHED_BARRIER.
1096 SchedGroupMask InvertedMask = ~Mask;
1097
1098 // ALU implies VALU, SALU, MFMA.
1099 if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
1100 InvertedMask &=
1101 ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
1102 // VALU, SALU, MFMA implies ALU.
1103 else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
1104 (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
1105 (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
1106 InvertedMask &= ~SchedGroupMask::ALU;
1107
1108 // VMEM implies VMEM_READ, VMEM_WRITE.
1109 if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE)
1110 InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE;
1111 // VMEM_READ, VMEM_WRITE implies VMEM.
1112 else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE ||
1113 (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE)
1114 InvertedMask &= ~SchedGroupMask::VMEM;
1115
1116 // DS implies DS_READ, DS_WRITE.
1117 if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE)
1118 InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE;
1119 // DS_READ, DS_WRITE implies DS.
1120 else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE ||
1121 (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
1122 InvertedMask &= ~SchedGroupMask::DS;
1123
1124 return InvertedMask;
1125}
1126
1127void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
1128 std::vector<SUnit>::reverse_iterator RIter) {
1129 // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
1130 // to the instruction having side effects.
1131 resetEdges(*RIter, DAG);
1132 MachineInstr &SGB = *RIter->getInstr();
1133 assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
1134 int32_t SGMask = SGB.getOperand(0).getImm();
1135 int32_t Size = SGB.getOperand(1).getImm();
1136 int32_t SyncID = SGB.getOperand(2).getImm();
1137
1138 auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
1139 Size, SyncID, DAG, TII);
1140
1141 SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
1142}
1143
1144void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
1145 IGLPStrategyID StrategyID =
1146 (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
1147 auto S = createIGLPStrategy(StrategyID, DAG, TII);
1148 if (S->shouldApplyStrategy(DAG))
1149 S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
1150}
1151
1152} // namespace
1153
1154namespace llvm {
1155
1156std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
1157 return std::make_unique<IGroupLPDAGMutation>();
1158}
1159
1160} // end namespace llvm
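
One possible way to address the report, sketched against the loop in solveGreedy around lines 647-651 (an illustration only, not necessarily the change adopted upstream): initialize the pointer and guard the dereference so an iteration whose SyncPipeline contains no group with CandSGID is skipped rather than dereferenced.

  SchedGroup *Match = nullptr;
  for (auto &SG : SyncPipeline) {
    if (SG.getSGID() == CandSGID)
      Match = &SG;
  }
  if (!Match)   // no SchedGroup in this pipeline carries CandSGID
    continue;   // or: assert(Match && "candidate SGID not found in SyncPipeline");

The same declared-without-an-initial-value pattern also appears in populateReadyList (line 503) and solveExact (line 564), so a comparable guard may be warranted there as well.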