LCOV - code coverage report
Current view: top level - include/llvm/MC - MCSchedule.h (source / functions) Hit Total Coverage
Test: llvm-toolchain.info Lines: 9 20 45.0 %
Date: 2018-10-20 13:21:21 Functions: 0 10 0.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : //===-- llvm/MC/MCSchedule.h - Scheduling -----------------------*- C++ -*-===//
       2             : //
       3             : //                     The LLVM Compiler Infrastructure
       4             : //
       5             : // This file is distributed under the University of Illinois Open Source
       6             : // License. See LICENSE.TXT for details.
       7             : //
       8             : //===----------------------------------------------------------------------===//
       9             : //
      10             : // This file defines the classes used to describe a subtarget's machine model
      11             : // for scheduling and other instruction cost heuristics.
      12             : //
      13             : //===----------------------------------------------------------------------===//
      14             : 
      15             : #ifndef LLVM_MC_MCSCHEDULE_H
      16             : #define LLVM_MC_MCSCHEDULE_H
      17             : 
      18             : #include "llvm/ADT/Optional.h"
      19             : #include "llvm/Config/llvm-config.h"
      20             : #include "llvm/Support/DataTypes.h"
      21             : #include <cassert>
      22             : 
      23             : namespace llvm {
      24             : 
      25             : struct InstrItinerary;
      26             : class MCSubtargetInfo;
      27             : class MCInstrInfo;
      28             : class MCInst;
      29             : class InstrItineraryData;
      30             : 
      31             : /// Define a kind of processor resource that will be modeled by the scheduler.
      32             : struct MCProcResourceDesc {
      33             :   const char *Name;
      34             :   unsigned NumUnits; // Number of resource of this kind
      35             :   unsigned SuperIdx; // Index of the resources kind that contains this kind.
      36             : 
      37             :   // Number of resources that may be buffered.
      38             :   //
      39             :   // Buffered resources (BufferSize != 0) may be consumed at some indeterminate
      40             :   // cycle after dispatch. This should be used for out-of-order cpus when
      41             :   // instructions that use this resource can be buffered in a reservaton
      42             :   // station.
      43             :   //
      44             :   // Unbuffered resources (BufferSize == 0) always consume their resource some
      45             :   // fixed number of cycles after dispatch. If a resource is unbuffered, then
      46             :   // the scheduler will avoid scheduling instructions with conflicting resources
      47             :   // in the same cycle. This is for in-order cpus, or the in-order portion of
      48             :   // an out-of-order cpus.
      49             :   int BufferSize;
      50             : 
      51             :   // If the resource has sub-units, a pointer to the first element of an array
      52             :   // of `NumUnits` elements containing the ProcResourceIdx of the sub units.
      53             :   // nullptr if the resource does not have sub-units.
      54             :   const unsigned *SubUnitsIdxBegin;
      55             : 
      56             :   bool operator==(const MCProcResourceDesc &Other) const {
      57             :     return NumUnits == Other.NumUnits && SuperIdx == Other.SuperIdx
      58             :       && BufferSize == Other.BufferSize;
      59             :   }
      60             : };
      61             : 
      62             : /// Identify one of the processor resource kinds consumed by a particular
      63             : /// scheduling class for the specified number of cycles.
      64             : struct MCWriteProcResEntry {
      65             :   uint16_t ProcResourceIdx;
      66             :   uint16_t Cycles;
      67             : 
      68           0 :   bool operator==(const MCWriteProcResEntry &Other) const {
      69     4000687 :     return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
      70             :   }
      71             : };
      72             : 
      73             : /// Specify the latency in cpu cycles for a particular scheduling class and def
      74             : /// index. -1 indicates an invalid latency. Heuristics would typically consider
      75             : /// an instruction with invalid latency to have infinite latency.  Also identify
      76             : /// the WriteResources of this def. When the operand expands to a sequence of
      77             : /// writes, this ID is the last write in the sequence.
      78             : struct MCWriteLatencyEntry {
      79             :   int16_t Cycles;
      80             :   uint16_t WriteResourceID;
      81             : 
      82           0 :   bool operator==(const MCWriteLatencyEntry &Other) const {
      83      364295 :     return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
      84             :   }
      85             : };
      86             : 
      87             : /// Specify the number of cycles allowed after instruction issue before a
      88             : /// particular use operand reads its registers. This effectively reduces the
      89             : /// write's latency. Here we allow negative cycles for corner cases where
      90             : /// latency increases. This rule only applies when the entry's WriteResource
      91             : /// matches the write's WriteResource.
      92             : ///
      93             : /// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
      94             : /// WriteResourceIdx.
      95             : struct MCReadAdvanceEntry {
      96             :   unsigned UseIdx;
      97             :   unsigned WriteResourceID;
      98             :   int Cycles;
      99             : 
     100             :   bool operator==(const MCReadAdvanceEntry &Other) const {
     101       95460 :     return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
     102      131572 :       && Cycles == Other.Cycles;
     103             :   }
     104             : };
     105             : 
     106             : /// Summarize the scheduling resources required for an instruction of a
     107             : /// particular scheduling class.
     108             : ///
     109             : /// Defined as an aggregate struct for creating tables with initializer lists.
     110             : struct MCSchedClassDesc {
     111             :   static const unsigned short InvalidNumMicroOps = (1U << 14) - 1;
     112             :   static const unsigned short VariantNumMicroOps = InvalidNumMicroOps - 1;
     113             : 
     114             : #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
     115             :   const char* Name;
     116             : #endif
     117             :   uint16_t NumMicroOps : 14;
     118             :   bool     BeginGroup : 1;
     119             :   bool     EndGroup : 1;
     120             :   uint16_t WriteProcResIdx; // First index into WriteProcResTable.
     121             :   uint16_t NumWriteProcResEntries;
     122             :   uint16_t WriteLatencyIdx; // First index into WriteLatencyTable.
     123             :   uint16_t NumWriteLatencyEntries;
     124             :   uint16_t ReadAdvanceIdx; // First index into ReadAdvanceTable.
     125             :   uint16_t NumReadAdvanceEntries;
     126             : 
     127             :   bool isValid() const {
     128    12713041 :     return NumMicroOps != InvalidNumMicroOps;
     129             :   }
     130             :   bool isVariant() const {
     131    16883789 :     return NumMicroOps == VariantNumMicroOps;
     132             :   }
     133             : };
     134             : 
     135             : /// Specify the cost of a register definition in terms of number of physical
     136             : /// register allocated at register renaming stage. For example, AMD Jaguar.
     137             : /// natively supports 128-bit data types, and operations on 256-bit registers
     138             : /// (i.e. YMM registers) are internally split into two COPs (complex operations)
     139             : /// and each COP updates a physical register. Basically, on Jaguar, a YMM
     140             : /// register write effectively consumes two physical registers. That means,
     141             : /// the cost of a YMM write in the BtVer2 model is 2.
     142             : struct MCRegisterCostEntry {
     143             :   unsigned RegisterClassID;
     144             :   unsigned Cost;
     145             :   bool AllowMoveElimination;
     146             : };
     147             : 
     148             : /// A register file descriptor.
     149             : ///
     150             : /// This struct allows to describe processor register files. In particular, it
     151             : /// helps describing the size of the register file, as well as the cost of
     152             : /// allocating a register file at register renaming stage.
     153             : /// FIXME: this struct can be extended to provide information about the number
     154             : /// of read/write ports to the register file.  A value of zero for field
     155             : /// 'NumPhysRegs' means: this register file has an unbounded number of physical
     156             : /// registers.
     157             : struct MCRegisterFileDesc {
     158             :   const char *Name;
     159             :   uint16_t NumPhysRegs;
     160             :   uint16_t NumRegisterCostEntries;
     161             :   // Index of the first cost entry in MCExtraProcessorInfo::RegisterCostTable.
     162             :   uint16_t RegisterCostEntryIdx;
     163             :   // A value of zero means: there is no limit in the number of moves that can be
     164             :   // eliminated every cycle.
     165             :   uint16_t MaxMovesEliminatedPerCycle;
     166             :   // Ture if this register file only knows how to optimize register moves from
     167             :   // known zero registers.
     168             :   bool AllowZeroMoveEliminationOnly;
     169             : };
     170             : 
     171             : /// Provide extra details about the machine processor.
     172             : ///
     173             : /// This is a collection of "optional" processor information that is not
     174             : /// normally used by the LLVM machine schedulers, but that can be consumed by
     175             : /// external tools like llvm-mca to improve the quality of the peformance
     176             : /// analysis.
     177             : struct MCExtraProcessorInfo {
     178             :   // Actual size of the reorder buffer in hardware.
     179             :   unsigned ReorderBufferSize;
     180             :   // Number of instructions retired per cycle.
     181             :   unsigned MaxRetirePerCycle;
     182             :   const MCRegisterFileDesc *RegisterFiles;
     183             :   unsigned NumRegisterFiles;
     184             :   const MCRegisterCostEntry *RegisterCostTable;
     185             :   unsigned NumRegisterCostEntries;
     186             : 
     187             :   struct PfmCountersInfo {
     188             :     // An optional name of a performance counter that can be used to measure
     189             :     // cycles.
     190             :     const char *CycleCounter;
     191             : 
     192             :     // An optional name of a performance counter that can be used to measure
     193             :     // uops.
     194             :     const char *UopsCounter;
     195             : 
     196             :     // For each MCProcResourceDesc defined by the processor, an optional list of
     197             :     // names of performance counters that can be used to measure the resource
     198             :     // utilization.
     199             :     const char **IssueCounters;
     200             :   };
     201             :   PfmCountersInfo PfmCounters;
     202             : };
     203             : 
     204             : /// Machine model for scheduling, bundling, and heuristics.
     205             : ///
     206             : /// The machine model directly provides basic information about the
     207             : /// microarchitecture to the scheduler in the form of properties. It also
     208             : /// optionally refers to scheduler resource tables and itinerary
     209             : /// tables. Scheduler resource tables model the latency and cost for each
     210             : /// instruction type. Itinerary tables are an independent mechanism that
     211             : /// provides a detailed reservation table describing each cycle of instruction
     212             : /// execution. Subtargets may define any or all of the above categories of data
     213             : /// depending on the type of CPU and selected scheduler.
     214             : ///
     215             : /// The machine independent properties defined here are used by the scheduler as
     216             : /// an abstract machine model. A real micro-architecture has a number of
     217             : /// buffers, queues, and stages. Declaring that a given machine-independent
     218             : /// abstract property corresponds to a specific physical property across all
     219             : /// subtargets can't be done. Nonetheless, the abstract model is
     220             : /// useful. Futhermore, subtargets typically extend this model with processor
     221             : /// specific resources to model any hardware features that can be exploited by
     222             : /// sceduling heuristics and aren't sufficiently represented in the abstract.
     223             : ///
     224             : /// The abstract pipeline is built around the notion of an "issue point". This
     225             : /// is merely a reference point for counting machine cycles. The physical
     226             : /// machine will have pipeline stages that delay execution. The scheduler does
     227             : /// not model those delays because they are irrelevant as long as they are
     228             : /// consistent. Inaccuracies arise when instructions have different execution
     229             : /// delays relative to each other, in addition to their intrinsic latency. Those
     230             : /// special cases can be handled by TableGen constructs such as, ReadAdvance,
     231             : /// which reduces latency when reading data, and ResourceCycles, which consumes
     232             : /// a processor resource when writing data for a number of abstract
     233             : /// cycles.
     234             : ///
     235             : /// TODO: One tool currently missing is the ability to add a delay to
     236             : /// ResourceCycles. That would be easy to add and would likely cover all cases
     237             : /// currently handled by the legacy itinerary tables.
     238             : ///
     239             : /// A note on out-of-order execution and, more generally, instruction
     240             : /// buffers. Part of the CPU pipeline is always in-order. The issue point, which
     241             : /// is the point of reference for counting cycles, only makes sense as an
     242             : /// in-order part of the pipeline. Other parts of the pipeline are sometimes
     243             : /// falling behind and sometimes catching up. It's only interesting to model
     244             : /// those other, decoupled parts of the pipeline if they may be predictably
     245             : /// resource constrained in a way that the scheduler can exploit.
     246             : ///
     247             : /// The LLVM machine model distinguishes between in-order constraints and
     248             : /// out-of-order constraints so that the target's scheduling strategy can apply
     249             : /// appropriate heuristics. For a well-balanced CPU pipeline, out-of-order
     250             : /// resources would not typically be treated as a hard scheduling
     251             : /// constraint. For example, in the GenericScheduler, a delay caused by limited
     252             : /// out-of-order resources is not directly reflected in the number of cycles
     253             : /// that the scheduler sees between issuing an instruction and its dependent
     254             : /// instructions. In other words, out-of-order resources don't directly increase
     255             : /// the latency between pairs of instructions. However, they can still be used
     256             : /// to detect potential bottlenecks across a sequence of instructions and bias
     257             : /// the scheduling heuristics appropriately.
     258             : struct MCSchedModel {
     259             :   // IssueWidth is the maximum number of instructions that may be scheduled in
     260             :   // the same per-cycle group. This is meant to be a hard in-order constraint
     261             :   // (a.k.a. "hazard"). In the GenericScheduler strategy, no more than
     262             :   // IssueWidth micro-ops can ever be scheduled in a particular cycle.
     263             :   //
     264             :   // In practice, IssueWidth is useful to model any bottleneck between the
     265             :   // decoder (after micro-op expansion) and the out-of-order reservation
     266             :   // stations or the decoder bandwidth itself. If the total number of
     267             :   // reservation stations is also a bottleneck, or if any other pipeline stage
     268             :   // has a bandwidth limitation, then that can be naturally modeled by adding an
     269             :   // out-of-order processor resource.
     270             :   unsigned IssueWidth;
     271             :   static const unsigned DefaultIssueWidth = 1;
     272             : 
     273             :   // MicroOpBufferSize is the number of micro-ops that the processor may buffer
     274             :   // for out-of-order execution.
     275             :   //
     276             :   // "0" means operations that are not ready in this cycle are not considered
     277             :   // for scheduling (they go in the pending queue). Latency is paramount. This
     278             :   // may be more efficient if many instructions are pending in a schedule.
     279             :   //
     280             :   // "1" means all instructions are considered for scheduling regardless of
     281             :   // whether they are ready in this cycle. Latency still causes issue stalls,
     282             :   // but we balance those stalls against other heuristics.
     283             :   //
     284             :   // "> 1" means the processor is out-of-order. This is a machine independent
     285             :   // estimate of highly machine specific characteristics such as the register
     286             :   // renaming pool and reorder buffer.
     287             :   unsigned MicroOpBufferSize;
     288             :   static const unsigned DefaultMicroOpBufferSize = 0;
     289             : 
     290             :   // LoopMicroOpBufferSize is the number of micro-ops that the processor may
     291             :   // buffer for optimized loop execution. More generally, this represents the
     292             :   // optimal number of micro-ops in a loop body. A loop may be partially
     293             :   // unrolled to bring the count of micro-ops in the loop body closer to this
     294             :   // number.
     295             :   unsigned LoopMicroOpBufferSize;
     296             :   static const unsigned DefaultLoopMicroOpBufferSize = 0;
     297             : 
     298             :   // LoadLatency is the expected latency of load instructions.
     299             :   unsigned LoadLatency;
     300             :   static const unsigned DefaultLoadLatency = 4;
     301             : 
     302             :   // HighLatency is the expected latency of "very high latency" operations.
     303             :   // See TargetInstrInfo::isHighLatencyDef().
     304             :   // By default, this is set to an arbitrarily high number of cycles
     305             :   // likely to have some impact on scheduling heuristics.
     306             :   unsigned HighLatency;
     307             :   static const unsigned DefaultHighLatency = 10;
     308             : 
     309             :   // MispredictPenalty is the typical number of extra cycles the processor
     310             :   // takes to recover from a branch misprediction.
     311             :   unsigned MispredictPenalty;
     312             :   static const unsigned DefaultMispredictPenalty = 10;
     313             : 
     314             :   bool PostRAScheduler; // default value is false
     315             : 
     316             :   bool CompleteModel;
     317             : 
     318             :   unsigned ProcID;
     319             :   const MCProcResourceDesc *ProcResourceTable;
     320             :   const MCSchedClassDesc *SchedClassTable;
     321             :   unsigned NumProcResourceKinds;
     322             :   unsigned NumSchedClasses;
     323             :   // Instruction itinerary tables used by InstrItineraryData.
     324             :   friend class InstrItineraryData;
     325             :   const InstrItinerary *InstrItineraries;
     326             : 
     327             :   const MCExtraProcessorInfo *ExtraProcessorInfo;
     328             : 
     329           0 :   bool hasExtraProcessorInfo() const { return ExtraProcessorInfo; }
     330             : 
     331           0 :   unsigned getProcessorID() const { return ProcID; }
     332             : 
     333             :   /// Does this machine model include instruction-level scheduling.
     334           0 :   bool hasInstrSchedModel() const { return SchedClassTable; }
     335             : 
     336           0 :   const MCExtraProcessorInfo &getExtraProcessorInfo() const {
     337             :     assert(hasExtraProcessorInfo() &&
     338             :            "No extra information available for this model");
     339           0 :     return *ExtraProcessorInfo;
     340             :   }
     341             : 
     342             :   /// Return true if this machine model data for all instructions with a
     343             :   /// scheduling class (itinerary class or SchedRW list).
     344             :   bool isComplete() const { return CompleteModel; }
     345             : 
     346             :   /// Return true if machine supports out of order execution.
     347       21333 :   bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
     348             : 
     349           0 :   unsigned getNumProcResourceKinds() const {
     350           0 :     return NumProcResourceKinds;
     351             :   }
     352             : 
     353           0 :   const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
     354             :     assert(hasInstrSchedModel() && "No scheduling machine model");
     355             : 
     356             :     assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
     357    15938487 :     return &ProcResourceTable[ProcResourceIdx];
     358             :   }
     359             : 
     360           0 :   const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
     361             :     assert(hasInstrSchedModel() && "No scheduling machine model");
     362             : 
     363             :     assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
     364    17126870 :     return &SchedClassTable[SchedClassIdx];
     365             :   }
     366             : 
     367             :   /// Returns the latency value for the scheduling class.
     368             :   static int computeInstrLatency(const MCSubtargetInfo &STI,
     369             :                                  const MCSchedClassDesc &SCDesc);
     370             : 
     371             :   int computeInstrLatency(const MCSubtargetInfo &STI, unsigned SClass) const;
     372             :   int computeInstrLatency(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
     373             :                           const MCInst &Inst) const;
     374             : 
     375             :   // Returns the reciprocal throughput information from a MCSchedClassDesc.
     376             :   static double
     377             :   getReciprocalThroughput(const MCSubtargetInfo &STI,
     378             :                           const MCSchedClassDesc &SCDesc);
     379             : 
     380             :   static double
     381             :   getReciprocalThroughput(unsigned SchedClass, const InstrItineraryData &IID);
     382             : 
     383             :   double
     384             :   getReciprocalThroughput(const MCSubtargetInfo &STI, const MCInstrInfo &MCII,
     385             :                           const MCInst &Inst) const;
     386             : 
     387             :   /// Returns the default initialized model.
     388             :   static const MCSchedModel &GetDefaultSchedModel() { return Default; }
     389             :   static const MCSchedModel Default;
     390             : };
     391             : 
     392             : } // namespace llvm
     393             : 
     394             : #endif

Generated by: LCOV version 1.13