File:    build/source/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
Warning: line 569, column 9: Called C++ object pointer is uninitialized
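
The diagnostic points into PipelineSolver::solveExact(): a SchedGroup pointer is declared without an initializer and is only assigned inside a conditional loop, so the analyzer cannot prove it is written before the isFull() call. A minimal sketch of the flagged pattern (a hypothetical reduction, not code copied from the file):

    SchedGroup *Match;               // never initialized
    for (auto &SG : SyncPipeline)
      if (SG.getSGID() == CandSGID)
        Match = &SG;                 // only reached when an SGID matches
    if (Match->isFull())             // line 569: may dereference garbage
      continue;

The usual remedy is to initialize the pointer to nullptr and assert that a match was found before dereferencing it.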
| 1 | //===--- AMDGPUIGroupLP.cpp - AMDGPU IGroupLP ------------===// | |||
| 2 | // | |||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | |||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
| 6 | // | |||
| 7 | //===----------------------------------------------------------------------===// | |||
| 8 | // | |||
| 9 | // \file This file defines a set of schedule DAG mutations that can be used to | |||
| 10 | // override default scheduler behavior to enforce specific scheduling patterns. | |||
| 11 | // They should be used in cases where runtime performance considerations, | |||
| 12 | // such as inter-wavefront interactions, mean that compile-time heuristics | |||
| 13 | // cannot predict the optimal instruction ordering, or in kernels where | |||
| 14 | // optimal instruction scheduling is important enough to warrant manual intervention. | |||
| 15 | // | |||
| 16 | //===----------------------------------------------------------------------===// | |||
| 17 | ||||
| 18 | #include "AMDGPUIGroupLP.h" | |||
| 19 | #include "AMDGPUTargetMachine.h" | |||
| 20 | #include "MCTargetDesc/AMDGPUMCTargetDesc.h" | |||
| 21 | #include "SIInstrInfo.h" | |||
| 22 | #include "SIMachineFunctionInfo.h" | |||
| 23 | #include "llvm/ADT/BitmaskEnum.h" | |||
| 24 | #include "llvm/ADT/DenseMap.h" | |||
| 25 | #include "llvm/CodeGen/MachineScheduler.h" | |||
| 26 | #include "llvm/CodeGen/TargetOpcodes.h" | |||
| 27 | ||||
| 28 | using namespace llvm; | |||
| 29 | ||||
| 30 | #define DEBUG_TYPE "igrouplp" | |||
| 31 | ||||
| 32 | namespace { | |||
| 33 | ||||
| 34 | static cl::opt<bool> EnableExactSolver( | |||
| 35 | "amdgpu-igrouplp-exact-solver", cl::Hidden, | |||
| 36 | cl::desc("Whether to use the exponential time solver to fit " | |||
| 37 | "the instructions to the pipeline as closely as " | |||
| 38 | "possible."), | |||
| 39 | cl::init(false)); | |||
| 40 | ||||
| 41 | static cl::opt<unsigned> CutoffForExact( | |||
| 42 | "amdgpu-igrouplp-exact-solver-cutoff", cl::init(0), cl::Hidden, | |||
| 43 | cl::desc("The maximum number of scheduling group conflicts " | |||
| 44 | "which we attempt to solve with the exponential time " | |||
| 45 | "exact solver. Problem sizes greater than this will" | |||
| 46 | "be solved by the less accurate greedy algorithm. Selecting " | |||
| 47 | "solver by size is superseded by manually selecting " | |||
| 48 | "the solver (e.g. by amdgpu-igrouplp-exact-solver")); | |||
| 49 | ||||
| 50 | static cl::opt<uint64_t> MaxBranchesExplored( | |||
| 51 | "amdgpu-igrouplp-exact-solver-max-branches", cl::init(0), cl::Hidden, | |||
| 52 | cl::desc("The amount of branches that we are willing to explore with" | |||
| 53 | "the exact algorithm before giving up.")); | |||
| 54 | ||||
| 55 | static cl::opt<bool> UseCostHeur( | |||
| 56 | "amdgpu-igrouplp-exact-solver-cost-heur", cl::init(true), cl::Hidden, | |||
| 57 | cl::desc("Whether to use the cost heuristic to make choices as we " | |||
| 58 | "traverse the search space using the exact solver. Defaulted " | |||
| 59 | "to on, and if turned off, we will use the node order -- " | |||
| 60 | "attempting to put the later nodes in the later sched groups. " | |||
| 61 | "Experimentally, results are mixed, so this should be set on a " | |||
| 62 | "case-by-case basis.")); | |||
| 63 | ||||
| 64 | // Components of the mask that determines which instruction types may be | |||
| 65 | // classified into a SchedGroup. | |||
| 66 | enum class SchedGroupMask { | |||
| 67 | NONE = 0u, | |||
| 68 | ALU = 1u << 0, | |||
| 69 | VALU = 1u << 1, | |||
| 70 | SALU = 1u << 2, | |||
| 71 | MFMA = 1u << 3, | |||
| 72 | VMEM = 1u << 4, | |||
| 73 | VMEM_READ = 1u << 5, | |||
| 74 | VMEM_WRITE = 1u << 6, | |||
| 75 | DS = 1u << 7, | |||
| 76 | DS_READ = 1u << 8, | |||
| 77 | DS_WRITE = 1u << 9, | |||
| 78 | ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS | | |||
| 79 | DS_READ | DS_WRITE, | |||
| 80 | LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL) | |||
| 81 | }; | |||
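
Since the enum is marked as a bitmask enum, the bitwise operators work directly on the scoped type. A small illustrative combination (hypothetical names, for exposition only; the operators may not be usable in constant expressions on older LLVM versions):

    // A mask that accepts any LDS access, whether it loads or stores:
    SchedGroupMask AnyDS =
        SchedGroupMask::DS | SchedGroupMask::DS_READ | SchedGroupMask::DS_WRITE;
    bool AcceptsDSRead = (AnyDS & SchedGroupMask::DS_READ) != SchedGroupMask::NONE;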
| 82 | ||||
| 83 | typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap; | |||
| 84 | ||||
| 85 | // Classify instructions into groups to enable fine tuned control over the | |||
| 86 | // scheduler. These groups may be more specific than current SchedModel | |||
| 87 | // instruction classes. | |||
| 88 | class SchedGroup { | |||
| 89 | private: | |||
| 90 | // Mask that defines which instruction types can be classified into this | |||
| 91 | // SchedGroup. The instruction types correspond to the mask from SCHED_BARRIER | |||
| 92 | // and SCHED_GROUP_BARRIER. | |||
| 93 | SchedGroupMask SGMask; | |||
| 94 | ||||
| 95 | // Maximum number of SUnits that can be added to this group. | |||
| 96 | std::optional<unsigned> MaxSize; | |||
| 97 | ||||
| 98 | // SchedGroups will only synchronize with other SchedGroups that have the same | |||
| 99 | // SyncID. | |||
| 100 | int SyncID = 0; | |||
| 101 | ||||
| 102 | // SGID is used to map instructions to candidate SchedGroups | |||
| 103 | unsigned SGID; | |||
| 104 | ||||
| 105 | // Count of the number of created SchedGroups, used to initialize SGID. | |||
| 106 | static unsigned NumSchedGroups; | |||
| 107 | ||||
| 108 | ScheduleDAGInstrs *DAG; | |||
| 109 | ||||
| 110 | const SIInstrInfo *TII; | |||
| 111 | ||||
| 112 | // Try to add an edge from SU A to SU B. | |||
| 113 | bool tryAddEdge(SUnit *A, SUnit *B); | |||
| 114 | ||||
| 115 | // Use SGMask to determine whether we can classify MI as a member of this | |||
| 116 | // SchedGroup object. | |||
| 117 | bool canAddMI(const MachineInstr &MI) const; | |||
| 118 | ||||
| 119 | public: | |||
| 120 | // Collection of SUnits that are classified as members of this group. | |||
| 121 | SmallVector<SUnit *, 32> Collection; | |||
| 122 | ||||
| 123 | // Returns true if SU can be added to this SchedGroup. | |||
| 124 | bool canAddSU(SUnit &SU) const; | |||
| 125 | ||||
| 126 | // Add DAG dependencies from all SUnits in this SchedGroup and this SU. If | |||
| 127 | // MakePred is true, SU will be a predecessor of the SUnits in this | |||
| 128 | // SchedGroup, otherwise SU will be a successor. | |||
| 129 | void link(SUnit &SU, bool MakePred = false); | |||
| 130 | ||||
| 131 | // Add DAG dependencies, track which edges are added, and return the count | |||
| 132 | // of missed edges. | |||
| 133 | int link(SUnit &SU, bool MakePred, | |||
| 134 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); | |||
| 135 | ||||
| 136 | // Add DAG dependencies from all SUnits in this SchedGroup and this SU. | |||
| 137 | // Use the predicate to determine whether SU should be a predecessor (P = | |||
| 138 | // true) or a successor (P = false) of this SchedGroup. | |||
| 139 | void link(SUnit &SU, function_ref<bool(const SUnit *A, const SUnit *B)> P); | |||
| 140 | ||||
| 141 | // Add DAG dependencies such that SUnits in this group shall be ordered | |||
| 142 | // before SUnits in OtherGroup. | |||
| 143 | void link(SchedGroup &OtherGroup); | |||
| 144 | ||||
| 145 | // Returns true if no more instructions may be added to this group. | |||
| 146 | bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; } | |||
| 147 | ||||
| 148 | // Add SU to the SchedGroup. | |||
| 149 | void add(SUnit &SU) { | |||
| 150 | LLVM_DEBUG(dbgs() << "For SchedGroup with mask "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) << " adding " << *SU.getInstr(); } } while (false) | |||
| 151 | << format_hex((int)SGMask, 10, true) << " adding "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) << " adding " << *SU.getInstr(); } } while (false) | |||
| 152 | << *SU.getInstr())do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) << " adding " << *SU.getInstr(); } } while (false); | |||
| 153 | Collection.push_back(&SU); | |||
| 154 | } | |||
| 155 | ||||
| 156 | // Remove last element in the SchedGroup | |||
| 157 | void pop() { Collection.pop_back(); } | |||
| 158 | ||||
| 159 | // Identify and add all relevant SUs from the DAG to this SchedGroup. | |||
| 160 | void initSchedGroup(); | |||
| 161 | ||||
| 162 | // Add instructions to the SchedGroup bottom up starting from RIter. | |||
| 163 | // PipelineInstrs is a set of instructions that should not be added to the | |||
| 164 | // SchedGroup even when the other conditions for adding it are satisfied. | |||
| 165 | // RIter will be added to the SchedGroup as well, and dependencies will be | |||
| 166 | // added so that RIter will always be scheduled at the end of the group. | |||
| 167 | void initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, | |||
| 168 | SUnitsToCandidateSGsMap &SyncedInstrs); | |||
| 169 | ||||
| 170 | void initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs); | |||
| 171 | ||||
| 172 | int getSyncID() { return SyncID; } | |||
| 173 | ||||
| 174 | int getSGID() { return SGID; } | |||
| 175 | ||||
| 176 | SchedGroupMask getMask() { return SGMask; } | |||
| 177 | ||||
| 178 | SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, | |||
| 179 | ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) | |||
| 180 | : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) { | |||
| 181 | SGID = NumSchedGroups++; | |||
| 182 | } | |||
| 183 | ||||
| 184 | SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID, | |||
| 185 | ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) | |||
| 186 | : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) { | |||
| 187 | SGID = NumSchedGroups++; | |||
| 188 | } | |||
| 189 | }; | |||
| 190 | ||||
| 191 | // Remove all existing edges from a SCHED_BARRIER or SCHED_GROUP_BARRIER. | |||
| 192 | static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) { | |||
| 193 | assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER || | |||
| 194 | SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER || | |||
| 195 | SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT); | |||
| 196 | ||||
| 197 | while (!SU.Preds.empty()) | |||
| 198 | for (auto &P : SU.Preds) | |||
| 199 | SU.removePred(P); | |||
| 200 | ||||
| 201 | while (!SU.Succs.empty()) | |||
| 202 | for (auto &S : SU.Succs) | |||
| 203 | for (auto &SP : S.getSUnit()->Preds) | |||
| 204 | if (SP.getSUnit() == &SU) | |||
| 205 | S.getSUnit()->removePred(SP); | |||
| 206 | } | |||
| 207 | ||||
| 208 | typedef std::pair<SUnit *, SmallVector<int, 4>> SUToCandSGsPair; | |||
| 209 | typedef SmallVector<SUToCandSGsPair, 4> SUsToCandSGsVec; | |||
| 210 | ||||
| 211 | // The PipelineSolver is used to assign SUnits to SchedGroups in a pipeline | |||
| 212 | // in non-trivial cases. For example, if the requested pipeline is | |||
| 213 | // {VMEM_READ, VALU, MFMA, VMEM_READ} and we encounter a VMEM_READ instruction | |||
| 214 | // in the DAG, then we will have an instruction that can not be trivially | |||
| 215 | // assigned to a SchedGroup. The PipelineSolver class implements two algorithms | |||
| 216 | // to find a good solution to the pipeline -- a greedy algorithm and an exact | |||
| 217 | // algorithm. The exact algorithm has an exponential time complexity and should | |||
| 218 | // only be used for small problems, or for medium-sized problems where an | |||
| 219 | // exact solution is highly desired. | |||
| 220 | class PipelineSolver { | |||
| 221 | ScheduleDAGMI *DAG; | |||
| 222 | ||||
| 223 | // Instructions that can be assigned to multiple SchedGroups | |||
| 224 | DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; | |||
| 225 | SmallVector<SUsToCandSGsVec, 4> PipelineInstrs; | |||
| 226 | DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; | |||
| 227 | // The current working pipeline | |||
| 228 | SmallVector<SmallVector<SchedGroup, 4>, 4> CurrPipeline; | |||
| 229 | // The pipeline that has the best solution found so far | |||
| 230 | SmallVector<SmallVector<SchedGroup, 4>, 4> BestPipeline; | |||
| 231 | ||||
| 232 | // Whether or not we actually have any SyncedInstrs to try to solve. | |||
| 233 | bool NeedsSolver = false; | |||
| 234 | ||||
| 235 | // Compute an estimate of the size of the search tree -- the true size is | |||
| 236 | // the product of each conflictedInst.Matches.size() across all SyncPipelines | |||
| 237 | unsigned computeProblemSize(); | |||
| 238 | ||||
| 239 | // The cost penalty of not assigning a SU to a SchedGroup | |||
| 240 | int MissPenalty = 0; | |||
| 241 | ||||
| 242 | // Costs in terms of the number of edges we are unable to add | |||
| 243 | int BestCost = -1; | |||
| 244 | int CurrCost = 0; | |||
| 245 | ||||
| 246 | // Index pointing to the conflicting instruction that is currently being | |||
| 247 | // fitted | |||
| 248 | int CurrConflInstNo = 0; | |||
| 249 | // Index to the pipeline that is currently being fitted | |||
| 250 | int CurrSyncGroupIdx = 0; | |||
| 251 | // The first non-trivial pipeline | |||
| 252 | int BeginSyncGroupIdx = 0; | |||
| 253 | ||||
| 254 | // How many branches we have explored | |||
| 255 | uint64_t BranchesExplored = 0; | |||
| 256 | ||||
| 257 | // Update indices to fit next conflicting instruction | |||
| 258 | void advancePosition(); | |||
| 259 | // Recede indices to attempt to find better fit for previous conflicting | |||
| 260 | // instruction | |||
| 261 | void retreatPosition(); | |||
| 262 | ||||
| 263 | // The exponential time algorithm which finds the provably best fit | |||
| 264 | bool solveExact(); | |||
| 265 | // The polynomial time algorithm which attempts to find a good fit | |||
| 266 | bool solveGreedy(); | |||
| 267 | // Whether or not the current solution is optimal | |||
| 268 | bool checkOptimal(); | |||
| 269 | // Populate the ready list, prioritizing fewest missed edges first | |||
| 270 | void populateReadyList(SUToCandSGsPair &CurrSU, | |||
| 271 | SmallVectorImpl<std::pair<int, int>> &ReadyList, | |||
| 272 | SmallVectorImpl<SchedGroup> &SyncPipeline); | |||
| 273 | // Add edges corresponding to the SchedGroups as assigned by solver | |||
| 274 | void makePipeline(); | |||
| 275 | // Add the edges from the SU to the other SchedGroups in pipeline, and | |||
| 276 | // return the number of edges missed. | |||
| 277 | int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, | |||
| 278 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); | |||
| 279 | // Remove the edges passed via AddedEdges | |||
| 280 | void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges); | |||
| 281 | // Convert the passed in maps to arrays for bidirectional iterators | |||
| 282 | void convertSyncMapsToArrays(); | |||
| 283 | ||||
| 284 | void reset(); | |||
| 285 | ||||
| 286 | public: | |||
| 287 | // Invoke the solver to map instructions to instruction groups. A heuristic | |||
| 288 | // and the command-line options above select the exact or greedy algorithm. | |||
| 289 | void solve(); | |||
| 290 | ||||
| 291 | PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups, | |||
| 292 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, | |||
| 293 | ScheduleDAGMI *DAG) | |||
| 294 | : DAG(DAG), SyncedInstrs(SyncedInstrs), | |||
| 295 | SyncedSchedGroups(SyncedSchedGroups) { | |||
| 296 | ||||
| 297 | for (auto &PipelineInstrs : SyncedInstrs) { | |||
| 298 | if (PipelineInstrs.second.size() > 0) { | |||
| 299 | NeedsSolver = true; | |||
| 300 | break; | |||
| 301 | } | |||
| 302 | } | |||
| 303 | ||||
| 304 | if (!NeedsSolver) | |||
| 305 | return; | |||
| 306 | ||||
| 307 | convertSyncMapsToArrays(); | |||
| 308 | ||||
| 309 | CurrPipeline = BestPipeline; | |||
| 310 | ||||
| 311 | while (static_cast<size_t>(BeginSyncGroupIdx) < PipelineInstrs.size() && | |||
| 312 | PipelineInstrs[BeginSyncGroupIdx].size() == 0) | |||
| 313 | ++BeginSyncGroupIdx; | |||
| 314 | ||||
| 315 | if (static_cast<size_t>(BeginSyncGroupIdx) >= PipelineInstrs.size()) | |||
| 316 | return; | |||
| 317 | } | |||
| 318 | }; | |||
| 319 | ||||
| 320 | void PipelineSolver::reset() { | |||
| 321 | ||||
| 322 | for (auto &SyncPipeline : CurrPipeline) { | |||
| 323 | for (auto &SG : SyncPipeline) { | |||
| 324 | SmallVector<SUnit *, 32> TempCollection = SG.Collection; | |||
| 325 | SG.Collection.clear(); | |||
| 326 | auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) { | |||
| 327 | return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER; | |||
| 328 | }); | |||
| 329 | if (SchedBarr != TempCollection.end()) | |||
| 330 | SG.Collection.push_back(*SchedBarr); | |||
| 331 | } | |||
| 332 | } | |||
| 333 | ||||
| 334 | CurrSyncGroupIdx = BeginSyncGroupIdx; | |||
| 335 | CurrConflInstNo = 0; | |||
| 336 | CurrCost = 0; | |||
| 337 | } | |||
| 338 | ||||
| 339 | void PipelineSolver::convertSyncMapsToArrays() { | |||
| 340 | for (auto &SyncPipe : SyncedSchedGroups) { | |||
| 341 | BestPipeline.insert(BestPipeline.begin(), SyncPipe.second); | |||
| 342 | } | |||
| 343 | ||||
| 344 | int PipelineIDx = SyncedInstrs.size() - 1; | |||
| 345 | PipelineInstrs.resize(SyncedInstrs.size()); | |||
| 346 | for (auto &SyncInstrMap : SyncedInstrs) { | |||
| 347 | for (auto &SUsToCandSGs : SyncInstrMap.second) { | |||
| 348 | if (PipelineInstrs[PipelineIDx].size() == 0) { | |||
| 349 | PipelineInstrs[PipelineIDx].push_back( | |||
| 350 | std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); | |||
| 351 | continue; | |||
| 352 | } | |||
| 353 | auto SortPosition = PipelineInstrs[PipelineIDx].begin(); | |||
| 354 | // Insert them in sorted order -- this allows for good parsing order in | |||
| 355 | // the greedy algorithm | |||
| 356 | while (SortPosition != PipelineInstrs[PipelineIDx].end() && | |||
| 357 | SUsToCandSGs.first->NodeNum > SortPosition->first->NodeNum) | |||
| 358 | ++SortPosition; | |||
| 359 | PipelineInstrs[PipelineIDx].insert( | |||
| 360 | SortPosition, std::pair(SUsToCandSGs.first, SUsToCandSGs.second)); | |||
| 361 | } | |||
| 362 | --PipelineIDx; | |||
| 363 | } | |||
| 364 | } | |||
| 365 | ||||
| 366 | void PipelineSolver::makePipeline() { | |||
| 367 | // Preserve the order of barriers for subsequent SchedGroupBarrier mutations | |||
| 368 | for (auto &SyncPipeline : BestPipeline) { | |||
| 369 | for (auto &SG : SyncPipeline) { | |||
| 370 | SUnit *SGBarr = nullptr; | |||
| 371 | for (auto &SU : SG.Collection) { | |||
| 372 | if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) | |||
| 373 | SGBarr = SU; | |||
| 374 | } | |||
| 375 | // A command-line-requested IGroupLP pipeline doesn't have an SGBarr | |||
| 376 | if (!SGBarr) | |||
| 377 | continue; | |||
| 378 | resetEdges(*SGBarr, DAG); | |||
| 379 | SG.link(*SGBarr, false); | |||
| 380 | } | |||
| 381 | } | |||
| 382 | ||||
| 383 | for (auto &SyncPipeline : BestPipeline) { | |||
| 384 | auto I = SyncPipeline.rbegin(); | |||
| 385 | auto E = SyncPipeline.rend(); | |||
| 386 | for (; I != E; ++I) { | |||
| 387 | auto &GroupA = *I; | |||
| 388 | for (auto J = std::next(I); J != E; ++J) { | |||
| 389 | auto &GroupB = *J; | |||
| 390 | GroupA.link(GroupB); | |||
| 391 | } | |||
| 392 | } | |||
| 393 | } | |||
| 394 | } | |||
| 395 | ||||
| 396 | int PipelineSolver::addEdges( | |||
| 397 | SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID, | |||
| 398 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { | |||
| 399 | int AddedCost = 0; | |||
| 400 | bool MakePred = false; | |||
| 401 | ||||
| 402 | // The groups in the pipeline are in reverse order. Thus, | |||
| 403 | // by traversing them from last to first, we are traversing | |||
| 404 | them in the order in which they were introduced in the code. After we | |||
| 405 | // pass the group the SU is being assigned to, it should be | |||
| 406 | // linked as a predecessor of the subsequent SchedGroups | |||
| 407 | auto GroupNo = (int)SyncPipeline.size() - 1; | |||
| 408 | for (; GroupNo >= 0; GroupNo--) { | |||
| 409 | if (SyncPipeline[GroupNo].getSGID() == SGID) { | |||
| 410 | MakePred = true; | |||
| 411 | continue; | |||
| 412 | } | |||
| 413 | auto Group = &SyncPipeline[GroupNo]; | |||
| 414 | AddedCost += Group->link(*SU, MakePred, AddedEdges); | |||
| 415 | assert(AddedCost >= 0); | |||
| 416 | } | |||
| 417 | ||||
| 418 | return AddedCost; | |||
| 419 | } | |||
| 420 | ||||
| 421 | void PipelineSolver::removeEdges( | |||
| 422 | const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) { | |||
| 423 | // Only remove the edges that we have added when testing | |||
| 424 | // the fit. | |||
| 425 | for (auto &PredSuccPair : EdgesToRemove) { | |||
| 426 | SUnit *Pred = PredSuccPair.first; | |||
| 427 | SUnit *Succ = PredSuccPair.second; | |||
| 428 | ||||
| 429 | auto Match = llvm::find_if( | |||
| 430 | Succ->Preds, [&Pred](SDep &P) { return P.getSUnit() == Pred; }); | |||
| 431 | if (Match != Succ->Preds.end()) { | |||
| 432 | assert(Match->isArtificial()); | |||
| 433 | Succ->removePred(*Match); | |||
| 434 | } | |||
| 435 | } | |||
| 436 | } | |||
| 437 | ||||
| 438 | void PipelineSolver::advancePosition() { | |||
| 439 | ++CurrConflInstNo; | |||
| 440 | ||||
| 441 | if (static_cast<size_t>(CurrConflInstNo) >= | |||
| 442 | PipelineInstrs[CurrSyncGroupIdx].size()) { | |||
| 443 | CurrConflInstNo = 0; | |||
| 444 | ++CurrSyncGroupIdx; | |||
| 445 | // Advance to next non-trivial pipeline | |||
| 446 | while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size() && | |||
| 447 | PipelineInstrs[CurrSyncGroupIdx].size() == 0) | |||
| 448 | ++CurrSyncGroupIdx; | |||
| 449 | } | |||
| 450 | } | |||
| 451 | ||||
| 452 | void PipelineSolver::retreatPosition() { | |||
| 453 | assert(CurrConflInstNo >= 0); | |||
| 454 | assert(CurrSyncGroupIdx >= 0); | |||
| 455 | ||||
| 456 | if (CurrConflInstNo > 0) { | |||
| 457 | --CurrConflInstNo; | |||
| 458 | return; | |||
| 459 | } | |||
| 460 | ||||
| 461 | if (CurrConflInstNo == 0) { | |||
| 462 | // If we return to the starting position, we have explored | |||
| 463 | // the entire tree | |||
| 464 | if (CurrSyncGroupIdx == BeginSyncGroupIdx) | |||
| 465 | return; | |||
| 466 | ||||
| 467 | --CurrSyncGroupIdx; | |||
| 468 | // Go to previous non-trivial pipeline | |||
| 469 | while (PipelineInstrs[CurrSyncGroupIdx].size() == 0) | |||
| 470 | --CurrSyncGroupIdx; | |||
| 471 | ||||
| 472 | CurrConflInstNo = PipelineInstrs[CurrSyncGroupIdx].size() - 1; | |||
| 473 | } | |||
| 474 | } | |||
| 475 | ||||
| 476 | bool PipelineSolver::checkOptimal() { | |||
| 477 | if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) { | |||
| 478 | if (BestCost == -1 || CurrCost < BestCost) { | |||
| 479 | BestPipeline = CurrPipeline; | |||
| 480 | BestCost = CurrCost; | |||
| 481 | LLVM_DEBUG(dbgs() << "Found Fit with cost " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Found Fit with cost " << BestCost << "\n"; } } while (false); | |||
| 482 | } | |||
| 483 | assert(BestCost >= 0); | |||
| 484 | } | |||
| 485 | ||||
| 486 | bool DoneExploring = false; | |||
| 487 | if (MaxBranchesExplored > 0 && BranchesExplored >= MaxBranchesExplored) | |||
| 488 | DoneExploring = true; | |||
| 489 | ||||
| 490 | return (DoneExploring || BestCost == 0); | |||
| 491 | } | |||
| 492 | ||||
| 493 | void PipelineSolver::populateReadyList( | |||
| 494 | SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList, | |||
| 495 | SmallVectorImpl<SchedGroup> &SyncPipeline) { | |||
| 496 | assert(CurrSU.second.size() >= 1); | |||
| 497 | auto I = CurrSU.second.rbegin(); | |||
| 498 | auto E = CurrSU.second.rend(); | |||
| 499 | for (; I != E; ++I) { | |||
| 500 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; | |||
| 501 | int CandSGID = *I; | |||
| 502 | SchedGroup *Match = nullptr; | |||
| 503 | for (auto &SG : SyncPipeline) { | |||
| 504 | if (SG.getSGID() == CandSGID) | |||
| 505 | Match = &SG; | |||
| 506 | } | |||
| 507 | assert(Match && "no SchedGroup in the pipeline has the candidate SGID"); | |||
| 508 | if (UseCostHeur) { | |||
| 509 | if (Match->isFull()) { | |||
| 510 | ReadyList.push_back(std::pair(*I, MissPenalty)); | |||
| 511 | continue; | |||
| 512 | } | |||
| 513 | ||||
| 514 | int TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); | |||
| 515 | ReadyList.push_back(std::pair(*I, TempCost)); | |||
| 516 | removeEdges(AddedEdges); | |||
| 517 | } else | |||
| 518 | ReadyList.push_back(std::pair(*I, -1)); | |||
| 519 | } | |||
| 520 | ||||
| 521 | if (UseCostHeur) { | |||
| 522 | std::sort(ReadyList.begin(), ReadyList.end(), | |||
| 523 | [](std::pair<int, int> A, std::pair<int, int> B) { | |||
| 524 | return A.second < B.second; | |||
| 525 | }); | |||
| 526 | } | |||
| 527 | ||||
| 528 | assert(ReadyList.size() == CurrSU.second.size()); | |||
| 529 | } | |||
| 530 | ||||
| 531 | bool PipelineSolver::solveExact() { | |||
| 532 | if (checkOptimal()) | |||
| 533 | return true; | |||
| 534 | ||||
| 535 | if (static_cast<size_t>(CurrSyncGroupIdx) == PipelineInstrs.size()) | |||
| 536 | return false; | |||
| 537 | ||||
| 538 | assert(static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()); | |||
| 539 | assert(static_cast<size_t>(CurrConflInstNo) < | |||
| 540 | PipelineInstrs[CurrSyncGroupIdx].size()); | |||
| 541 | SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; | |||
| 542 | LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNumdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU .first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx << "\n"; } } while (false) | |||
| 543 | << ") in Pipeline # " << CurrSyncGroupIdx << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU .first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx << "\n"; } } while (false); | |||
| 544 | ||||
| 545 | // SchedGroup -> Cost pairs | |||
| 546 | SmallVector<std::pair<int, int>, 4> ReadyList; | |||
| 547 | // Prioritize the candidate sched groups in terms of lowest cost first | |||
| 548 | populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]); | |||
| 549 | ||||
| 550 | auto I = ReadyList.begin(); | |||
| 551 | auto E = ReadyList.end(); | |||
| 552 | for (; I != E; ++I) { | |||
| 553 | // If we are trying SGs in least cost order, and the current SG is cost | |||
| 554 | // infeasible, then all subsequent SGs will also be cost infeasible, so we | |||
| 555 | // can prune. | |||
| 556 | if (BestCost != -1 && (CurrCost + I->second > BestCost)) | |||
| 557 | return false; | |||
| 558 | ||||
| 559 | int CandSGID = I->first; | |||
| 560 | int AddedCost = 0; | |||
| 561 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; | |||
| 562 | auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; | |||
| 563 | SchedGroup *Match = nullptr; | |||
| 564 | for (auto &SG : SyncPipeline) { | |||
| 565 | if (SG.getSGID() == CandSGID) | |||
| 566 | Match = &SG; | |||
| 567 | } | |||
| 568 | assert(Match && "no SchedGroup in the pipeline has the candidate SGID"); | |||
| 569 | if (Match->isFull()) | |||
| 570 | continue; | |||
| 571 | ||||
| 572 | LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Assigning to SchedGroup with Mask " << (int)Match->getMask() << "and ID " << CandSGID << "\n"; } } while (false) | |||
| 573 | << (int)Match->getMask() << "and ID " << CandSGIDdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Assigning to SchedGroup with Mask " << (int)Match->getMask() << "and ID " << CandSGID << "\n"; } } while (false) | |||
| 574 | << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Assigning to SchedGroup with Mask " << (int)Match->getMask() << "and ID " << CandSGID << "\n"; } } while (false); | |||
| 575 | Match->add(*CurrSU.first); | |||
| 576 | AddedCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); | |||
| 577 | LLVM_DEBUG(dbgs() << "Cost of Assignment: " << AddedCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Cost of Assignment: " << AddedCost << "\n"; } } while (false); | |||
| 578 | CurrCost += AddedCost; | |||
| 579 | advancePosition(); | |||
| 580 | ++BranchesExplored; | |||
| 581 | bool FinishedExploring = false; | |||
| 582 | // If the Cost after adding edges is greater than a known solution, | |||
| 583 | // backtrack | |||
| 584 | if (CurrCost < BestCost || BestCost == -1) { | |||
| 585 | if (solveExact()) { | |||
| 586 | FinishedExploring = BestCost != 0; | |||
| 587 | if (!FinishedExploring) | |||
| 588 | return true; | |||
| 589 | } | |||
| 590 | } | |||
| 591 | ||||
| 592 | retreatPosition(); | |||
| 593 | CurrCost -= AddedCost; | |||
| 594 | removeEdges(AddedEdges); | |||
| 595 | Match->pop(); | |||
| 596 | CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; | |||
| 597 | if (FinishedExploring) | |||
| 598 | return true; | |||
| 599 | } | |||
| 600 | ||||
| 601 | // Try the pipeline where the current instruction is omitted | |||
| 602 | // Potentially if we omit a problematic instruction from the pipeline, | |||
| 603 | // all the other instructions can nicely fit. | |||
| 604 | CurrCost += MissPenalty; | |||
| 605 | advancePosition(); | |||
| 606 | ||||
| 607 | LLVM_DEBUG(dbgs() << "NOT Assigned (" << CurrSU.first->NodeNum << ")\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "NOT Assigned (" << CurrSU .first->NodeNum << ")\n"; } } while (false); | |||
| 608 | ||||
| 609 | bool FinishedExploring = false; | |||
| 610 | if (CurrCost < BestCost || BestCost == -1) { | |||
| 611 | if (solveExact()) { | |||
| 612 | FinishedExploring = BestCost != 0; | |||
| 613 | if (!FinishedExploring) | |||
| 614 | return true; | |||
| 615 | } | |||
| 616 | } | |||
| 617 | ||||
| 618 | retreatPosition(); | |||
| 619 | CurrCost -= MissPenalty; | |||
| 620 | return FinishedExploring; | |||
| 621 | } | |||
| 622 | ||||
| 623 | bool PipelineSolver::solveGreedy() { | |||
| 624 | BestCost = 0; | |||
| 625 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; | |||
| 626 | ||||
| 627 | while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) { | |||
| 628 | SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo]; | |||
| 629 | int BestNodeCost = -1; | |||
| 630 | int TempCost; | |||
| 631 | SchedGroup *BestGroup = nullptr; | |||
| 632 | int BestGroupID = -1; | |||
| 633 | auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx]; | |||
| 634 | LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNumdo { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU .first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx << "\n"; } } while (false) | |||
| 635 | << ") in Pipeline # " << CurrSyncGroupIdx << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Fitting SU(" << CurrSU .first->NodeNum << ") in Pipeline # " << CurrSyncGroupIdx << "\n"; } } while (false); | |||
| 636 | ||||
| 637 | // Since we have added the potential SchedGroups from bottom up, but | |||
| 638 | // traversed the DAG from top down, parse over the groups from last to | |||
| 639 | // first. If we fail to do this for the greedy algorithm, the solution will | |||
| 640 | // likely not be good in more complex cases. | |||
| 641 | auto I = CurrSU.second.rbegin(); | |||
| 642 | auto E = CurrSU.second.rend(); | |||
| 643 | for (; I != E; ++I) { | |||
| 644 | std::vector<std::pair<SUnit *, SUnit *>> AddedEdges; | |||
| 645 | int CandSGID = *I; | |||
| 646 | SchedGroup *Match = nullptr; | |||
| 647 | for (auto &SG : SyncPipeline) { | |||
| 648 | if (SG.getSGID() == CandSGID) | |||
| 649 | Match = &SG; | |||
| 650 | } | |||
| 651 | assert(Match && "no SchedGroup in the pipeline has the candidate SGID"); | |||
| 652 | LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Trying SGID # " << CandSGID << " with Mask " << (int)Match->getMask() << "\n"; } } while (false) | |||
| 653 | << (int)Match->getMask() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Trying SGID # " << CandSGID << " with Mask " << (int)Match->getMask() << "\n"; } } while (false); | |||
| 654 | ||||
| 655 | if (Match->isFull()) { | |||
| 656 | LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "SGID # " << CandSGID << " is full\n"; } } while (false); | |||
| 657 | continue; | |||
| 658 | } | |||
| 659 | TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges); | |||
| 660 | LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Cost of Group " << TempCost << "\n"; } } while (false); | |||
| 661 | if (TempCost < BestNodeCost || BestNodeCost == -1) { | |||
| 662 | BestGroup = Match; | |||
| 663 | BestNodeCost = TempCost; | |||
| 664 | BestGroupID = CandSGID; | |||
| 665 | } | |||
| 666 | removeEdges(AddedEdges); | |||
| 667 | if (BestNodeCost == 0) | |||
| 668 | break; | |||
| 669 | } | |||
| 670 | ||||
| 671 | if (BestGroupID != -1) { | |||
| 672 | BestGroup->add(*CurrSU.first); | |||
| 673 | addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges); | |||
| 674 | LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" << (int)BestGroup-> getMask() << "\n"; } } while (false) | |||
| 675 | << (int)BestGroup->getMask() << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Best Group has ID: " << BestGroupID << " and Mask" << (int)BestGroup-> getMask() << "\n"; } } while (false); | |||
| 676 | BestCost += TempCost; | |||
| 677 | } else | |||
| 678 | BestCost += MissPenalty; | |||
| 679 | ||||
| 680 | CurrPipeline[CurrSyncGroupIdx] = SyncPipeline; | |||
| 681 | advancePosition(); | |||
| 682 | } | |||
| 683 | BestPipeline = CurrPipeline; | |||
| 684 | removeEdges(AddedEdges); | |||
| 685 | return false; | |||
| 686 | } | |||
| 687 | ||||
| 688 | unsigned PipelineSolver::computeProblemSize() { | |||
| 689 | unsigned ProblemSize = 0; | |||
| 690 | for (auto &PipeConflicts : PipelineInstrs) { | |||
| 691 | ProblemSize += PipeConflicts.size(); | |||
| 692 | } | |||
| 693 | ||||
| 694 | return ProblemSize; | |||
| 695 | } | |||
| 696 | ||||
| 697 | void PipelineSolver::solve() { | |||
| 698 | if (!NeedsSolver) | |||
| 699 | return; | |||
| 700 | ||||
| 701 | unsigned ProblemSize = computeProblemSize(); | |||
| 702 | assert(ProblemSize > 0); | |||
| 703 | ||||
| 704 | bool BelowCutoff = (CutoffForExact > 0) && ProblemSize <= CutoffForExact; | |||
| 705 | MissPenalty = (ProblemSize / 2) + 1; | |||
| 706 | ||||
| 707 | LLVM_DEBUG(DAG->dump()); | |||
| 708 | if (EnableExactSolver || BelowCutoff) { | |||
| 709 | LLVM_DEBUG(dbgs() << "Starting Greedy pipeline solver\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Starting Greedy pipeline solver\n" ; } } while (false); | |||
| 710 | solveGreedy(); | |||
| 711 | reset(); | |||
| 712 | LLVM_DEBUG(dbgs() << "Greedy produced best cost of " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Greedy produced best cost of " << BestCost << "\n"; } } while (false); | |||
| 713 | if (BestCost > 0) { | |||
| 714 | LLVM_DEBUG(dbgs() << "Starting EXACT pipeline solver\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Starting EXACT pipeline solver\n" ; } } while (false); | |||
| 715 | solveExact(); | |||
| 716 | LLVM_DEBUG(dbgs() << "Exact produced best cost of " << BestCost << "\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Exact produced best cost of " << BestCost << "\n"; } } while (false); | |||
| 717 | } | |||
| 718 | } else { // Use the Greedy Algorithm by default | |||
| 719 | LLVM_DEBUG(dbgs() << "Starting GREEDY pipeline solver\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Starting GREEDY pipeline solver\n" ; } } while (false); | |||
| 720 | solveGreedy(); | |||
| 721 | } | |||
| 722 | ||||
| 723 | makePipeline(); | |||
| 724 | } | |||
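
For experimentation, the cl::opts declared at the top of this file can be set from llc; a sketch of an invocation (assuming an assertions-enabled build so -debug-only works with the DEBUG_TYPE above, and an input containing the scheduling pseudo-instructions):

    llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a \
        -amdgpu-igrouplp-exact-solver -debug-only=igrouplp \
        kernel.ll -o kernel.s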
| 725 | ||||
| 726 | enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 }; | |||
| 727 | ||||
| 728 | // Implement an IGLP scheduling strategy. | |||
| 729 | class IGLPStrategy { | |||
| 730 | protected: | |||
| 731 | ScheduleDAGInstrs *DAG; | |||
| 732 | ||||
| 733 | const SIInstrInfo *TII; | |||
| 734 | ||||
| 735 | public: | |||
| 736 | // Add SchedGroups to \p Pipeline to implement this Strategy. | |||
| 737 | virtual void applyIGLPStrategy( | |||
| 738 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, | |||
| 739 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0; | |||
| 740 | ||||
| 741 | // Returns true if this strategy should be applied to a ScheduleDAG. | |||
| 742 | virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0; | |||
| 743 | ||||
| 744 | IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) | |||
| 745 | : DAG(DAG), TII(TII) {} | |||
| 746 | ||||
| 747 | virtual ~IGLPStrategy() = default; | |||
| 748 | }; | |||
| 749 | ||||
| 750 | class MFMASmallGemmOpt final : public IGLPStrategy { | |||
| 751 | public: | |||
| 752 | void applyIGLPStrategy( | |||
| 753 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, | |||
| 754 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override; | |||
| 755 | ||||
| 756 | bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; } | |||
| 757 | ||||
| 758 | MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII) | |||
| 759 | : IGLPStrategy(DAG, TII) {} | |||
| 760 | }; | |||
| 761 | ||||
| 762 | void MFMASmallGemmOpt::applyIGLPStrategy( | |||
| 763 | DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs, | |||
| 764 | DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) { | |||
| 765 | // Count the number of MFMA instructions. | |||
| 766 | unsigned MFMACount = 0; | |||
| 767 | for (const MachineInstr &I : *DAG) | |||
| 768 | if (TII->isMFMAorWMMA(I)) | |||
| 769 | ++MFMACount; | |||
| 770 | ||||
| 771 | const unsigned PipelineSyncID = 0; | |||
| 772 | SchedGroup *SG = nullptr; | |||
| 773 | for (unsigned I = 0; I < MFMACount * 3; ++I) { | |||
| 774 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( | |||
| 775 | SchedGroupMask::DS, 2, PipelineSyncID, DAG, TII); | |||
| 776 | SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); | |||
| 777 | ||||
| 778 | SG = &SyncedSchedGroups[PipelineSyncID].emplace_back( | |||
| 779 | SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII); | |||
| 780 | SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]); | |||
| 781 | } | |||
| 782 | } | |||
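
From device code, this strategy is requested via an IGLP_OPT pseudo whose immediate is the IGLPStrategyID decoded in initIGLPOpt() below. A hedged HIP/C++ sketch using the clang builtin (the kernel context is hypothetical; assumes a clang that exposes the builtin):

    // In the hot loop of a small-GEMM kernel:
    __builtin_amdgcn_iglp_opt(0); // 0 == MFMASmallGemmOptID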
| 783 | ||||
| 784 | static std::unique_ptr<IGLPStrategy> | |||
| 785 | createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG, | |||
| 786 | const SIInstrInfo *TII) { | |||
| 787 | switch (ID) { | |||
| 788 | case MFMASmallGemmOptID: | |||
| 789 | return std::make_unique<MFMASmallGemmOpt>(DAG, TII); | |||
| 790 | } | |||
| 791 | ||||
| 792 | llvm_unreachable("Unknown IGLPStrategyID")::llvm::llvm_unreachable_internal("Unknown IGLPStrategyID", "llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp" , 792); | |||
| 793 | } | |||
| 794 | ||||
| 795 | class IGroupLPDAGMutation : public ScheduleDAGMutation { | |||
| 796 | private: | |||
| 797 | const SIInstrInfo *TII; | |||
| 798 | ||||
| 799 | ScheduleDAGMI *DAG; | |||
| 800 | ||||
| 801 | // Organize lists of SchedGroups by their SyncID. SchedGroups / | |||
| 802 | // SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added | |||
| 803 | // between them. | |||
| 804 | DenseMap<int, SmallVector<SchedGroup, 4>> SyncedSchedGroups; | |||
| 805 | ||||
| 806 | // Used to track instructions that can be mapped to multiple sched groups | |||
| 807 | DenseMap<int, SUnitsToCandidateSGsMap> SyncedInstrs; | |||
| 808 | ||||
| 809 | // Add DAG edges that enforce SCHED_BARRIER ordering. | |||
| 810 | void addSchedBarrierEdges(SUnit &SU); | |||
| 811 | ||||
| 812 | // Use a SCHED_BARRIER's mask to identify instruction SchedGroups that should | |||
| 813 | // not be reordered across the SCHED_BARRIER. This is used for the base | |||
| 814 | // SCHED_BARRIER, and not SCHED_GROUP_BARRIER. The difference is that | |||
| 815 | // SCHED_BARRIER will always block all instructions that can be classified | |||
| 816 | // into a particular SchedClass, whereas SCHED_GROUP_BARRIER has a fixed size | |||
| 817 | // and may only synchronize with some SchedGroups. Returns the inverse of | |||
| 818 | // Mask. SCHED_BARRIER's mask describes which instruction types should be | |||
| 819 | // allowed to be scheduled across it. Invert the mask to get the | |||
| 820 | // SchedGroupMask of instructions that should be barred. | |||
| 821 | SchedGroupMask invertSchedBarrierMask(SchedGroupMask Mask) const; | |||
| 822 | ||||
| 823 | // Create SchedGroups for a SCHED_GROUP_BARRIER. | |||
| 824 | void initSchedGroupBarrierPipelineStage( | |||
| 825 | std::vector<SUnit>::reverse_iterator RIter); | |||
| 826 | ||||
| 827 | void initIGLPOpt(SUnit &SU); | |||
| 828 | ||||
| 829 | public: | |||
| 830 | void apply(ScheduleDAGInstrs *DAGInstrs) override; | |||
| 831 | ||||
| 832 | IGroupLPDAGMutation() = default; | |||
| 833 | }; | |||
| 834 | ||||
| 835 | unsigned SchedGroup::NumSchedGroups = 0; | |||
| 836 | ||||
| 837 | bool SchedGroup::tryAddEdge(SUnit *A, SUnit *B) { | |||
| 838 | if (A != B && DAG->canAddEdge(B, A)) { | |||
| 839 | DAG->addEdge(B, SDep(A, SDep::Artificial)); | |||
| 840 | return true; | |||
| 841 | } | |||
| 842 | return false; | |||
| 843 | } | |||
| 844 | ||||
| 845 | bool SchedGroup::canAddMI(const MachineInstr &MI) const { | |||
| 846 | bool Result = false; | |||
| 847 | if (MI.isMetaInstruction()) | |||
| 848 | Result = false; | |||
| 849 | ||||
| 850 | else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) && | |||
| 851 | (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI))) | |||
| 852 | Result = true; | |||
| 853 | ||||
| 854 | else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) && | |||
| 855 | TII->isVALU(MI) && !TII->isMFMAorWMMA(MI)) | |||
| 856 | Result = true; | |||
| 857 | ||||
| 858 | else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) && | |||
| 859 | TII->isSALU(MI)) | |||
| 860 | Result = true; | |||
| 861 | ||||
| 862 | else if (((SGMask & SchedGroupMask::MFMA) != SchedGroupMask::NONE) && | |||
| 863 | TII->isMFMAorWMMA(MI)) | |||
| 864 | Result = true; | |||
| 865 | ||||
| 866 | else if (((SGMask & SchedGroupMask::VMEM) != SchedGroupMask::NONE) && | |||
| 867 | (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) | |||
| 868 | Result = true; | |||
| 869 | ||||
| 870 | else if (((SGMask & SchedGroupMask::VMEM_READ) != SchedGroupMask::NONE) && | |||
| 871 | MI.mayLoad() && | |||
| 872 | (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) | |||
| 873 | Result = true; | |||
| 874 | ||||
| 875 | else if (((SGMask & SchedGroupMask::VMEM_WRITE) != SchedGroupMask::NONE) && | |||
| 876 | MI.mayStore() && | |||
| 877 | (TII->isVMEM(MI) || (TII->isFLAT(MI) && !TII->isDS(MI)))) | |||
| 878 | Result = true; | |||
| 879 | ||||
| 880 | else if (((SGMask & SchedGroupMask::DS) != SchedGroupMask::NONE) && | |||
| 881 | TII->isDS(MI)) | |||
| 882 | Result = true; | |||
| 883 | ||||
| 884 | else if (((SGMask & SchedGroupMask::DS_READ) != SchedGroupMask::NONE) && | |||
| 885 | MI.mayLoad() && TII->isDS(MI)) | |||
| 886 | Result = true; | |||
| 887 | ||||
| 888 | else if (((SGMask & SchedGroupMask::DS_WRITE) != SchedGroupMask::NONE) && | |||
| 889 | MI.mayStore() && TII->isDS(MI)) | |||
| 890 | Result = true; | |||
| 891 | ||||
| 892 | LLVM_DEBUG( | |||
| 893 | dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true) | |||
| 894 | << (Result ? " could classify " : " unable to classify ") << MI); | |||
| 895 | ||||
| 896 | return Result; | |||
| 897 | } | |||
| 898 | ||||
| 899 | int SchedGroup::link(SUnit &SU, bool MakePred, | |||
| 900 | std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) { | |||
| 901 | int MissedEdges = 0; | |||
| 902 | for (auto *A : Collection) { | |||
| 903 | SUnit *B = &SU; | |||
| 904 | if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) | |||
| 905 | continue; | |||
| 906 | if (MakePred) | |||
| 907 | std::swap(A, B); | |||
| 908 | ||||
| 909 | if (DAG->IsReachable(B, A)) | |||
| 910 | continue; | |||
| 911 | // tryAddEdge returns false if there is a dependency that makes adding | |||
| 912 | // the A->B edge impossible, otherwise it returns true. | |||
| 913 | bool Added = tryAddEdge(A, B); | |||
| 914 | if (Added) | |||
| 915 | AddedEdges.push_back(std::pair(A, B)); | |||
| 916 | else | |||
| 917 | ++MissedEdges; | |||
| 918 | } | |||
| 919 | ||||
| 920 | return MissedEdges; | |||
| 921 | } | |||
| 922 | ||||
| 923 | void SchedGroup::link(SUnit &SU, bool MakePred) { | |||
| 924 | for (auto *A : Collection) { | |||
| 925 | SUnit *B = &SU; | |||
| 926 | if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER) | |||
| 927 | continue; | |||
| 928 | if (MakePred) | |||
| 929 | std::swap(A, B); | |||
| 930 | ||||
| 931 | tryAddEdge(A, B); | |||
| 932 | } | |||
| 933 | } | |||
| 934 | ||||
| 935 | void SchedGroup::link(SUnit &SU, | |||
| 936 | function_ref<bool(const SUnit *A, const SUnit *B)> P) { | |||
| 937 | for (auto *A : Collection) { | |||
| 938 | SUnit *B = &SU; | |||
| 939 | if (P(A, B)) | |||
| 940 | std::swap(A, B); | |||
| 941 | ||||
| 942 | tryAddEdge(A, B); | |||
| 943 | } | |||
| 944 | } | |||
| 945 | ||||
| 946 | void SchedGroup::link(SchedGroup &OtherGroup) { | |||
| 947 | for (auto *B : OtherGroup.Collection) | |||
| 948 | link(*B); | |||
| 949 | } | |||
| 950 | ||||
| 951 | bool SchedGroup::canAddSU(SUnit &SU) const { | |||
| 952 | MachineInstr &MI = *SU.getInstr(); | |||
| 953 | if (MI.getOpcode() != TargetOpcode::BUNDLE) | |||
| 954 | return canAddMI(MI); | |||
| 955 | ||||
| 956 | // Special case for bundled MIs. | |||
| 957 | const MachineBasicBlock *MBB = MI.getParent(); | |||
| 958 | MachineBasicBlock::instr_iterator B = MI.getIterator(), E = ++B; | |||
| 959 | while (E != MBB->end() && E->isBundledWithPred()) | |||
| 960 | ++E; | |||
| 961 | ||||
| 962 | // Return true if all of the bundled MIs can be added to this group. | |||
| 963 | return std::all_of(B, E, [this](MachineInstr &MI) { return canAddMI(MI); }); | |||
| 964 | } | |||
| 965 | ||||
| 966 | void SchedGroup::initSchedGroup() { | |||
| 967 | for (auto &SU : DAG->SUnits) { | |||
| 968 | if (isFull()) | |||
| 969 | break; | |||
| 970 | ||||
| 971 | if (canAddSU(SU)) | |||
| 972 | add(SU); | |||
| 973 | } | |||
| 974 | } | |||
| 975 | ||||
| 976 | void SchedGroup::initSchedGroup(std::vector<SUnit>::reverse_iterator RIter, | |||
| 977 | SUnitsToCandidateSGsMap &SyncedInstrs) { | |||
| 978 | SUnit &InitSU = *RIter; | |||
| 979 | for (auto E = DAG->SUnits.rend(); RIter != E; ++RIter) { | |||
| 980 | auto &SU = *RIter; | |||
| 981 | if (isFull()) | |||
| 982 | break; | |||
| 983 | ||||
| 984 | if (canAddSU(SU)) | |||
| 985 | SyncedInstrs[&SU].push_back(SGID); | |||
| 986 | } | |||
| 987 | ||||
| 988 | add(InitSU); | |||
| 989 | assert(MaxSize); | |||
| 990 | (*MaxSize)++; | |||
| 991 | } | |||
| 992 | ||||
| 993 | void SchedGroup::initSchedGroup(SUnitsToCandidateSGsMap &SyncedInstrs) { | |||
| 994 | auto I = DAG->SUnits.rbegin(); | |||
| 995 | auto E = DAG->SUnits.rend(); | |||
| 996 | for (; I != E; ++I) { | |||
| 997 | auto &SU = *I; | |||
| 998 | if (isFull()) | |||
| 999 | break; | |||
| 1000 | ||||
| 1001 | if (canAddSU(SU)) | |||
| 1002 | SyncedInstrs[&SU].push_back(SGID); | |||
| 1003 | } | |||
| 1004 | } | |||
| 1005 | ||||
| 1006 | void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) { | |||
| 1007 | const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); | |||
| 1008 | if (!TSchedModel || DAGInstrs->SUnits.empty()) | |||
| 1009 | return; | |||
| 1010 | ||||
| 1011 | LLVM_DEBUG(dbgs() << "Applying IGroupLPDAGMutation...\n")do { if (::llvm::DebugFlag && ::llvm::isCurrentDebugType ("igrouplp")) { dbgs() << "Applying IGroupLPDAGMutation...\n" ; } } while (false); | |||
| 1012 | const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); | |||
| 1013 | TII = ST.getInstrInfo(); | |||
| 1014 | DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); | |||
| 1015 | SyncedSchedGroups.clear(); | |||
| 1016 | SyncedInstrs.clear(); | |||
| 1017 | bool foundSB = false; | |||
| 1018 | bool foundIGLP = false; | |||
| 1019 | for (auto R = DAG->SUnits.rbegin(), E = DAG->SUnits.rend(); R != E; ++R) { | |||
| 1020 | unsigned Opc = R->getInstr()->getOpcode(); | |||
| 1021 | // SCHED_[GROUP_]BARRIER and IGLP are mutually exclusive. | |||
| 1022 | if (Opc == AMDGPU::SCHED_BARRIER) { | |||
| 1023 | addSchedBarrierEdges(*R); | |||
| 1024 | foundSB = true; | |||
| 1025 | } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) { | |||
| 1026 | initSchedGroupBarrierPipelineStage(R); | |||
| 1027 | foundSB = true; | |||
| 1028 | } else if (Opc == AMDGPU::IGLP_OPT) { | |||
| 1029 | resetEdges(*R, DAG); | |||
| 1030 | if (!foundSB && !foundIGLP) | |||
| 1031 | initIGLPOpt(*R); | |||
| 1032 | foundIGLP = true; | |||
| 1033 | } | |||
| 1034 | } | |||
| 1035 | ||||
| 1036 | if (foundSB || foundIGLP) { | |||
| 1037 | PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG); | |||
| 1038 | // PipelineSolver performs the mutation by adding the edges it | |||
| 1039 | // determined as the best | |||
| 1040 | PS.solve(); | |||
| 1041 | } | |||
| 1042 | } | |||
| 1043 | ||||
| 1044 | void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) { | |||
| 1045 | MachineInstr &MI = *SchedBarrier.getInstr(); | |||
| 1046 | assert(MI.getOpcode() == AMDGPU::SCHED_BARRIER); | |||
| 1047 | // Remove all existing edges from the SCHED_BARRIER that were added due to the | |||
| 1048 | // instruction having side effects. | |||
| 1049 | resetEdges(SchedBarrier, DAG); | |||
| 1050 | auto InvertedMask = | |||
| 1051 | invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm()); | |||
| 1052 | SchedGroup SG(InvertedMask, std::nullopt, DAG, TII); | |||
| 1053 | SG.initSchedGroup(); | |||
| 1054 | // Preserve original instruction ordering relative to the SCHED_BARRIER. | |||
| 1055 | SG.link( | |||
| 1056 | SchedBarrier, | |||
| 1057 | (function_ref<bool(const SUnit *A, const SUnit *B)>)[]( | |||
| 1058 | const SUnit *A, const SUnit *B) { return A->NodeNum > B->NodeNum; }); | |||
| 1059 | } | |||
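
The mask immediate originates from the sched_barrier builtin in device code. A hedged sketch (mask bits follow the SchedGroupMask enum above; the surrounding kernel is hypothetical):

    // Allow only VALU instructions (bit 0x2) to be scheduled across this
    // point; every other class lands in the inverted-mask SchedGroup.
    __builtin_amdgcn_sched_barrier(0x2);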
| 1060 | ||||
| 1061 | SchedGroupMask | |||
| 1062 | IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const { | |||
| 1063 | // Invert mask and erase bits for types of instructions that are implied to be | |||
| 1064 | // allowed past the SCHED_BARRIER. | |||
| 1065 | SchedGroupMask InvertedMask = ~Mask; | |||
| 1066 | ||||
| 1067 | // ALU implies VALU, SALU, MFMA. | |||
| 1068 | if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE) | |||
| 1069 | InvertedMask &= | |||
| 1070 | ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA; | |||
| 1071 | // VALU, SALU, MFMA implies ALU. | |||
| 1072 | else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE || | |||
| 1073 | (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE || | |||
| 1074 | (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE) | |||
| 1075 | InvertedMask &= ~SchedGroupMask::ALU; | |||
| 1076 | ||||
| 1077 | // VMEM implies VMEM_READ, VMEM_WRITE. | |||
| 1078 | if ((InvertedMask & SchedGroupMask::VMEM) == SchedGroupMask::NONE) | |||
| 1079 | InvertedMask &= ~SchedGroupMask::VMEM_READ & ~SchedGroupMask::VMEM_WRITE; | |||
| 1080 | // VMEM_READ, VMEM_WRITE implies VMEM. | |||
| 1081 | else if ((InvertedMask & SchedGroupMask::VMEM_READ) == SchedGroupMask::NONE || | |||
| 1082 | (InvertedMask & SchedGroupMask::VMEM_WRITE) == SchedGroupMask::NONE) | |||
| 1083 | InvertedMask &= ~SchedGroupMask::VMEM; | |||
| 1084 | ||||
| 1085 | // DS implies DS_READ, DS_WRITE. | |||
| 1086 | if ((InvertedMask & SchedGroupMask::DS) == SchedGroupMask::NONE) | |||
| 1087 | InvertedMask &= ~SchedGroupMask::DS_READ & ~SchedGroupMask::DS_WRITE; | |||
| 1088 | // DS_READ, DS_WRITE implies DS. | |||
| 1089 | else if ((InvertedMask & SchedGroupMask::DS_READ) == SchedGroupMask::NONE || | |||
| 1090 | (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE) | |||
| 1091 | InvertedMask &= ~SchedGroupMask::DS; | |||
| 1092 | ||||
| 1093 | return InvertedMask; | |||
| 1094 | } | |||
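
A worked example using the enum values above: a SCHED_BARRIER mask of 0x2 lets only VALU cross. Within ALL (0x3FF) the complement is 0x3FD, and because VALU remains allowed, the "VALU implies ALU" rule also clears the ALU bit:

    SchedGroupMask Mask = (SchedGroupMask)0x2; // only VALU may cross
    SchedGroupMask Inv = ~Mask;                // 0x3FD within ALL
    Inv &= ~SchedGroupMask::ALU;               // 0x3FC
    // Inv now names every class that must not move across the barrier.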
| 1095 | ||||
| 1096 | void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage( | |||
| 1097 | std::vector<SUnit>::reverse_iterator RIter) { | |||
| 1098 | // Remove all existing edges from the SCHED_GROUP_BARRIER that were added due | |||
| 1099 | // to the instruction having side effects. | |||
| 1100 | resetEdges(*RIter, DAG); | |||
| 1101 | MachineInstr &SGB = *RIter->getInstr(); | |||
| 1102 | assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER); | |||
| 1103 | int32_t SGMask = SGB.getOperand(0).getImm(); | |||
| 1104 | int32_t Size = SGB.getOperand(1).getImm(); | |||
| 1105 | int32_t SyncID = SGB.getOperand(2).getImm(); | |||
| 1106 | ||||
| 1107 | auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask, | |||
| 1108 | Size, SyncID, DAG, TII); | |||
| 1109 | ||||
| 1110 | SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]); | |||
| 1111 | } | |||
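
The three immediates decoded above map one-to-one onto the sched_group_barrier builtin. A hedged sketch requesting, from device code, the same DS/MFMA interleaving that MFMASmallGemmOpt builds programmatically (mask values taken from the SchedGroupMask enum):

    // (Mask, Size, SyncID): two LDS ops, then one MFMA, same sync group.
    __builtin_amdgcn_sched_group_barrier(0x080, 2, 0); // DS,   size 2, sync 0
    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA, size 1, sync 0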
| 1112 | ||||
| 1113 | void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) { | |||
| 1114 | IGLPStrategyID StrategyID = | |||
| 1115 | (IGLPStrategyID)SU.getInstr()->getOperand(0).getImm(); | |||
| 1116 | auto S = createIGLPStrategy(StrategyID, DAG, TII); | |||
| 1117 | if (S->shouldApplyStrategy(DAG)) | |||
| 1118 | S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups); | |||
| 1119 | } | |||
| 1120 | ||||
| 1121 | } // namespace | |||
| 1122 | ||||
| 1123 | namespace llvm { | |||
| 1124 | ||||
| 1125 | std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() { | |||
| 1126 | return std::make_unique<IGroupLPDAGMutation>(); | |||
| 1127 | } | |||
| 1128 | ||||
| 1129 | } // end namespace llvm |
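
createIGroupLPDAGMutation() is the file's only external entry point. A minimal sketch of wiring it into a machine scheduler you own (the AMDGPU schedulers do the equivalent internally; the helper name is hypothetical):

    #include "AMDGPUIGroupLP.h"
    #include "llvm/CodeGen/MachineScheduler.h"

    void addIGroupLP(llvm::ScheduleDAGMI &DAG) {
      // Runs after DAG construction; adds the artificial edges chosen by
      // the PipelineSolver when SCHED_BARRIER/IGLP pseudos are present.
      DAG.addMutation(llvm::createIGroupLPDAGMutation());
    }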