8#ifndef _popops_performance_estimation_h_
9#define _popops_performance_estimation_h_
12#include <poplar/PerfEstimateFunc.hpp>
13#include <poplar/Target.hpp>
14#include <poplar/VectorLayout.hpp>
44 const unsigned vectorSize,
45 const unsigned cyclesPerVector);
62 const unsigned cyclesPerVector,
64 const unsigned numElems,
65 const std::uint64_t overheadPerLoop);
78 const unsigned regionSize,
79 const unsigned numSubElements);
92 const popops::expr::BinaryOpType op,
93 const unsigned numElems);
104 unsigned atomicWriteSize;
105 unsigned dataPathWidth;
106 unsigned bytesPerElem;
107 unsigned numWorkerContexts;
/// Declaration of getMultiSliceCycleEstimate, which returns a std::uint64_t.
///
/// NOTE(review): the numeric prefixes (110, 112, ...) are line numbers from the
/// rendered source listing. 111 is not present, so at least one leading
/// parameter may be missing from this extract - confirm against the header.
///
/// Visible parameters: elemsPerSlice, numOffsets, numOffsetsInRangePerWorker
/// and offsetsPerDictEntry are const unsigned values; isUpdate,
/// indicesAreSorted and splitSingleRegion are const bool flags that each
/// default to false.
/// NOTE(review): presumably the result is a cycle count for the MultiSlice
/// vertex - verify against the implementation.
110std::uint64_t getMultiSliceCycleEstimate(
112 const unsigned elemsPerSlice,
const unsigned numOffsets,
113 const unsigned numOffsetsInRangePerWorker,
114 const unsigned offsetsPerDictEntry,
const bool isUpdate =
false,
115 const bool indicesAreSorted =
false,
const bool splitSingleRegion =
false);
125 unsigned atomicWriteSize;
126 unsigned numWorkerContexts;
127 unsigned bytesPerElem;
/// Declaration of getMultiUpdateOpCycleEstimate, which returns a
/// std::uint64_t.
///
/// NOTE(review): the numeric prefixes (130, 132, ...) are line numbers from the
/// rendered source listing. 131 is not present, so at least one leading
/// parameter may be missing from this extract - confirm against the header.
///
/// Visible parameters: subWordWritesRequired (bool); elemsPerSlice, numOffsets,
/// numOffsetsInRangePerWorker and offsetsPerDictEntry (const unsigned); op (an
/// Operation value); scaled (const bool); scaleHigherPrecisionThanData and
/// indicesAreSorted (const bool, each defaulting to false).
/// NOTE(review): presumably `op` selects the update operation being costed and
/// the result is a cycle count - verify against the implementation.
130std::uint64_t getMultiUpdateOpCycleEstimate(
132 bool subWordWritesRequired,
const unsigned elemsPerSlice,
133 const unsigned numOffsets,
const unsigned numOffsetsInRangePerWorker,
134 const unsigned offsetsPerDictEntry,
const Operation op,
const bool scaled,
135 const bool scaleHigherPrecisionThanData =
false,
136 const bool indicesAreSorted =
false);
146 unsigned numWorkerContexts;
147 unsigned dataPathWidth;
148 unsigned fromTypeSize;
155 std::vector<unsigned> &elemCounts);
/// Declaration of getCast1DSingleWorkerCycleEstimate, which returns a
/// std::uint64_t.
///
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing; 158-163 are not present, so only the final parameter (numElems, a
/// const unsigned) is visible here. Recover the remaining parameters from the
/// original header before relying on this signature.
157std::uint64_t getCast1DSingleWorkerCycleEstimate(
164 const unsigned numElems);
/// Parameter bundle holding a single unsigned member, dataPathWidth, which the
/// visible member-initialiser sets from target.getDataPathWidth().
///
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing. 167 (the constructor head that introduces `target`) and the
/// closing brace of the struct are not present in this extract.
166struct FillTargetParameters {
168 : dataPathWidth(target.getDataPathWidth()) {}
169 unsigned dataPathWidth;
/// Declaration of getFill1DCycleEstimate, which returns a std::uint64_t.
///
/// Visible parameters: targetParams (const FillTargetParameters &) and numElems
/// (const unsigned).
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing; 173 is not present, so a parameter may be missing between the two
/// shown - confirm against the original header.
172std::uint64_t getFill1DCycleEstimate(
const FillTargetParameters &targetParams,
174 const unsigned numElems);
/// Declaration of getFill2DCycleEstimate, which returns a std::uint64_t.
///
/// Visible parameters: targetParams (const FillTargetParameters &) and numElems
/// (const std::vector<unsigned> &).
/// NOTE(review): presumably numElems holds one element count per region -
/// verify against the implementation.
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing; 177 is not present, so a parameter may be missing between the two
/// shown - confirm against the original header.
176std::uint64_t getFill2DCycleEstimate(
const FillTargetParameters &targetParams,
178 const std::vector<unsigned> &numElems);
/// Identifies which scaled-arithmetic operation is being referred to.
///
/// The enumerators have no explicit values, so they take the implicit values
/// 0 to 3 in declaration order; keep this order stable if anything depends on
/// the numeric value.
enum class ScaledArithmeticOp {
  ADD,      ///< NOTE(review): presumably a scaled addition - confirm.
  SUBTRACT, ///< NOTE(review): presumably a scaled subtraction - confirm.
  AXPLUSBY, ///< NOTE(review): name suggests a*x + b*y - confirm.
  AXMINUSBY ///< NOTE(review): name suggests a*x - b*y - confirm.
};
/// Parameter bundle with two unsigned members. The visible member-initialiser
/// list sets numWorkerContexts from target.getNumWorkerContexts() and
/// vectorWidth from target.getVectorWidth(dataType).
///
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing. 183-184 (the constructor head that introduces `target` and
/// `dataType`) and the closing brace of the struct are not present in this
/// extract.
182struct ScaledArithmeticTargetParameters {
185 : numWorkerContexts(target.getNumWorkerContexts()),
186 vectorWidth(target.getVectorWidth(dataType)) {}
187 unsigned numWorkerContexts;
188 unsigned vectorWidth;
/// Start of the declaration of getScaledArithmeticSupervisorCycleEstimate,
/// which returns a std::uint64_t.
///
/// Visible parameters: targetParams (const ScaledArithmeticTargetParameters &),
/// memConstrained (const bool) and operation (const ScaledArithmeticOp).
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing. 193 is not present and the parameter list is not closed in this
/// extract, so further parameters exist that cannot be seen here.
191std::uint64_t getScaledArithmeticSupervisorCycleEstimate(
192 const ScaledArithmeticTargetParameters &targetParams,
194 const bool memConstrained,
const ScaledArithmeticOp operation,
/// Declaration of broadcastArithmetic1DCycleEstimate, which returns a
/// poplar::VertexPerfEstimate.
///
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing; 200-201 are not present, so only the final parameter (dataSize, a
/// std::size_t) is visible here. Recover the leading parameters from the
/// original header.
199poplar::VertexPerfEstimate broadcastArithmetic1DCycleEstimate(
202 std::size_t dataSize);
/// Declaration of broadcastArithmeticCycleEstimate, which returns a
/// poplar::VertexPerfEstimate.
///
/// Visible parameters: uniformScalar (bool) and data
/// (const std::vector<std::size_t> &).
/// NOTE(review): presumably `data` carries per-region sizes, by analogy with
/// the 1D variant's `dataSize` - verify against the implementation.
/// NOTE(review): the numeric prefixes are line numbers from the rendered source
/// listing; 206-207 are not present, so the leading parameters are missing
/// from this extract.
205poplar::VertexPerfEstimate broadcastArithmeticCycleEstimate(
208 bool uniformScalar,
const std::vector<std::size_t> &data);
Expressions with elements of tensors.
Define types of operations used in Reduce/MultiUpdate.
A target representation.
Definition: Target.hpp:69
std::size_t getTypeSize(const Type &) const
Get the size of a given type in bytes.
unsigned getDataPathWidth() const
The width of the load/store data path within the tile.
std::size_t getAtomicStoreGranularity() const
Get the granularity of atomic stores that can be made by independent parallel worker threads.
unsigned getNumWorkerContexts() const
The number of worker contexts per tile.
Class representing device data types.
Definition: Type.hpp:42
Vector
An enumeration used to state what type of pointer is used for a Vector vertex field.
Definition: VectorLayout.hpp:15
Common functions, such as elementwise operations and reductions.
Definition: AllTrue.hpp:15
Operation
Type of operation to use in a reduction.
Definition: OperationDef.hpp:15
Target parameters used in cast estimation.
Definition: PerformanceEstimation.hpp:139
Cycle estimate for MultiSlice vertex.
Definition: PerformanceEstimation.hpp:97
Cycle estimate for MultiUpdateAdd vertex.
Definition: PerformanceEstimation.hpp:119