Poplar and PopLibs
PerformanceEstimation.hpp
// Copyright (c) 2020 Graphcore Ltd. All rights reserved.

#ifndef _popops_performance_estimation_h_
#define _popops_performance_estimation_h_

#include "popops/Expr.hpp"
#include <poplar/PerfEstimateFunc.hpp>
#include <poplar/Target.hpp>
#include <poplar/VectorLayout.hpp>

namespace popops {

// All functions within the internal namespace are liable to be changed at any
// point without prior notice.
namespace internal {

// Supervisor context cycle estimate for BinaryOp and UnaryOp
// Supervisor/MultiVertex codelets.
std::uint64_t basicOpSupervisorOverhead(const bool isScaledPtr64Type = false);

// Cycle cost for processing an arbitrary number of elements given the cycle
// cost for processing a vector.
std::uint64_t basicOpLoopCycles(const unsigned numElems,
                                const unsigned vectorSize,
                                const unsigned cyclesPerVector);

// Cycle cost for processing an arbitrary number of elements given the cycle
// cost for processing a vector.
std::uint64_t binaryOpInnerLoopCycles(const poplar::Target &target,
                                      const poplar::Type &type,
                                      const unsigned cyclesPerVector,
                                      const bool vectorize,
                                      const unsigned numElems,
                                      const std::uint64_t overheadPerLoop);

// Cycle estimate for the Dynamic Slice 1D vertex.
std::uint64_t getDynamicSlice1DEstimate(const poplar::Target &target,
                                        const poplar::Type &type,
                                        const unsigned regionSize,
                                        const unsigned numSubElements);

// Cycle estimate for the Binary-1D in-place MultiVertex.
std::uint64_t getBinaryOp1DInPlaceEstimate(const poplar::Target &target,
                                           const poplar::Type &type,
                                           const popops::expr::BinaryOpType op,
                                           const unsigned numElems);

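// Example (illustrative sketch): querying the in-place binary-op estimate for
// a half-precision elementwise ADD. The helper name and the element count are
// assumed placeholders, not part of the PopLibs API.
inline std::uint64_t exampleBinaryOpInPlaceCycles(const poplar::Target &target) {
  return getBinaryOp1DInPlaceEstimate(target, poplar::HALF,
                                      popops::expr::BinaryOpType::ADD,
                                      /* numElems */ 1024);
}
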
// Target parameters used in the MultiSlice vertex cycle estimate.
struct MultiSliceTargetParameters {
  MultiSliceTargetParameters(const poplar::Target &target,
                             const poplar::Type &type)
      : atomicWriteSize(target.getAtomicStoreGranularity()),
        dataPathWidth(target.getDataPathWidth()),
        bytesPerElem(target.getTypeSize(type)),
        numWorkerContexts(target.getNumWorkerContexts()) {}
  unsigned atomicWriteSize;
  unsigned dataPathWidth;
  unsigned bytesPerElem;
  unsigned numWorkerContexts;
};

// Cycle estimate for the MultiSlice vertex.
std::uint64_t getMultiSliceCycleEstimate(
    const MultiSliceTargetParameters &targetParams,
    const unsigned elemsPerSlice, const unsigned numOffsets,
    const unsigned numOffsetsInRangePerWorker,
    const unsigned offsetsPerDictEntry, const bool isUpdate = false,
    const bool indicesAreSorted = false, const bool splitSingleRegion = false);
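
// Example (illustrative sketch): constructing the target parameters and
// querying the MultiSlice estimate for a half-precision dictionary. The helper
// name, slice size and offset counts are assumed placeholders.
inline std::uint64_t exampleMultiSliceCycles(const poplar::Target &target) {
  const MultiSliceTargetParameters params(target, poplar::HALF);
  const unsigned numOffsets = 128;
  // Assume the offsets are split evenly between the worker contexts.
  const unsigned offsetsPerWorker =
      (numOffsets + params.numWorkerContexts - 1) / params.numWorkerContexts;
  return getMultiSliceCycleEstimate(params, /* elemsPerSlice */ 64, numOffsets,
                                    offsetsPerWorker,
                                    /* offsetsPerDictEntry */ 1);
}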

// Target parameters used in the MultiUpdateOp/MultiUpdateAdd vertex cycle
// estimates.
struct MultiUpdateOpTargetParameters {
  MultiUpdateOpTargetParameters(const poplar::Target &target,
                                const poplar::Type &type)
      : atomicWriteSize(target.getAtomicStoreGranularity()),
        numWorkerContexts(target.getNumWorkerContexts()),
        bytesPerElem(target.getTypeSize(type)) {}
  unsigned atomicWriteSize;
  unsigned numWorkerContexts;
  unsigned bytesPerElem;
};

// Cycle estimate for the MultiUpdateOp/MultiUpdateAdd vertices.
std::uint64_t getMultiUpdateOpCycleEstimate(
    const MultiUpdateOpTargetParameters &targetParams,
    bool subWordWritesRequired, const unsigned elemsPerSlice,
    const unsigned numOffsets, const unsigned numOffsetsInRangePerWorker,
    const unsigned offsetsPerDictEntry, const Operation op, const bool scaled,
    const bool scaleHigherPrecisionThanData = false,
    const bool indicesAreSorted = false);

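// Example (illustrative sketch): querying the multi-update estimate for a
// scaled half-precision accumulation (the MultiUpdateAdd case). The helper
// name and the counts are assumed placeholders; whether sub-word writes are
// needed is inferred here from the atomic store granularity.
inline std::uint64_t exampleMultiUpdateAddCycles(const poplar::Target &target) {
  const MultiUpdateOpTargetParameters params(target, poplar::HALF);
  // A sub-word write is needed if one element is smaller than the smallest
  // store a worker can make atomically.
  const bool subWordWritesRequired =
      params.bytesPerElem < params.atomicWriteSize;
  const unsigned numOffsets = 128;
  const unsigned offsetsPerWorker =
      (numOffsets + params.numWorkerContexts - 1) / params.numWorkerContexts;
  return getMultiUpdateOpCycleEstimate(
      params, subWordWritesRequired, /* elemsPerSlice */ 64, numOffsets,
      offsetsPerWorker, /* offsetsPerDictEntry */ 1, Operation::ADD,
      /* scaled */ true);
}
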
// Target parameters used in cast estimation.
struct CastTargetParameters {
  CastTargetParameters(const poplar::Target &target,
                       const poplar::Type &fromType,
                       const poplar::Type &toType)
      : numWorkerContexts(target.getNumWorkerContexts()),
        dataPathWidth(target.getDataPathWidth()),
        fromTypeSize(target.getTypeSize(fromType)),
        toTypeSize(target.getTypeSize(toType)) {}
  unsigned numWorkerContexts;
  unsigned dataPathWidth;
  unsigned fromTypeSize;
  unsigned toTypeSize;
};

// Cycle estimate for the Cast 2D vertex.
std::uint64_t getCast2DCycleEstimate(const CastTargetParameters &targetParams,
                                     const poplar::Type &fromType,
                                     const poplar::Type &toType,
                                     std::vector<unsigned> &elemCounts);

// Cycle estimate for the Cast 1D vertex, for a single worker.
std::uint64_t getCast1DSingleWorkerCycleEstimate(
    const CastTargetParameters &targetParams, const poplar::Type &fromType,
    const poplar::Type &toType, const unsigned numElems);

// Cycle estimate for the Cast 1D vertex.
std::uint64_t getCast1DCycleEstimate(const CastTargetParameters &targetParams,
                                     const poplar::Type &fromType,
                                     const poplar::Type &toType,
                                     const unsigned numElems);
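
// Example (illustrative sketch): estimating a half-to-float cast performed by
// a single worker. The helper name and element count are assumed placeholders.
inline std::uint64_t exampleCastCycles(const poplar::Target &target) {
  const CastTargetParameters params(target, poplar::HALF, poplar::FLOAT);
  return getCast1DSingleWorkerCycleEstimate(params, poplar::HALF,
                                            poplar::FLOAT,
                                            /* numElems */ 2000);
}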

// Target parameters used in fill estimation.
struct FillTargetParameters {
  FillTargetParameters(const poplar::Target &target)
      : dataPathWidth(target.getDataPathWidth()) {}
  unsigned dataPathWidth;
};

// Cycle estimate for the Fill 1D vertex.
std::uint64_t getFill1DCycleEstimate(const FillTargetParameters &targetParams,
                                     const poplar::Type &type,
                                     const unsigned numElems);

// Cycle estimate for the Fill 2D vertex.
std::uint64_t getFill2DCycleEstimate(const FillTargetParameters &targetParams,
                                     const poplar::Type &type,
                                     const std::vector<unsigned> &numElems);
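
// Example (illustrative sketch): combining the 1D and 2D fill estimates for
// float data. The helper name and region sizes are assumed placeholders.
inline std::uint64_t exampleFillCycles(const poplar::Target &target) {
  const FillTargetParameters params(target);
  const std::vector<unsigned> regionSizes = {48, 16, 100};
  return getFill1DCycleEstimate(params, poplar::FLOAT, /* numElems */ 1000) +
         getFill2DCycleEstimate(params, poplar::FLOAT, regionSizes);
}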

// Scaled arithmetic operations covered by the estimate below.
enum class ScaledArithmeticOp { ADD, SUBTRACT, AXPLUSBY, AXMINUSBY };

// Target parameters used in scaled arithmetic estimation.
struct ScaledArithmeticTargetParameters {
  ScaledArithmeticTargetParameters(const poplar::Target &target,
                                   const poplar::Type &dataType)
      : numWorkerContexts(target.getNumWorkerContexts()),
        vectorWidth(target.getVectorWidth(dataType)) {}
  unsigned numWorkerContexts;
  unsigned vectorWidth;
};

// Cycle estimate for the scaled arithmetic supervisor vertices.
std::uint64_t getScaledArithmeticSupervisorCycleEstimate(
    const ScaledArithmeticTargetParameters &targetParams,
    const poplar::Type &dataType, const poplar::Type &dataBType,
    const bool memConstrained, const ScaledArithmeticOp operation,
    const poplar::layout::Vector &aLayout,
    const poplar::layout::Vector &bLayout, const unsigned numElems);
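
// Example (illustrative sketch): querying the scaled-add estimate. The vector
// layouts depend on how the vertex fields are laid out, so they are taken as
// parameters; the helper name, element count and memory-constraint flag are
// assumed placeholders.
inline std::uint64_t
exampleScaledAddCycles(const poplar::Target &target,
                       const poplar::layout::Vector &aLayout,
                       const poplar::layout::Vector &bLayout) {
  const ScaledArithmeticTargetParameters params(target, poplar::HALF);
  return getScaledArithmeticSupervisorCycleEstimate(
      params, /* dataType */ poplar::HALF, /* dataBType */ poplar::HALF,
      /* memConstrained */ false, ScaledArithmeticOp::ADD, aLayout, bLayout,
      /* numElems */ 1024);
}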

// Computes the cycles used by the scalar broadcast 1D codelet
poplar::VertexPerfEstimate broadcastArithmetic1DCycleEstimate(
    const poplar::Target &target, popops::expr::BinaryOpType op,
    const poplar::Type &inType, const poplar::Type &outType, bool inPlace,
    std::size_t dataSize);

// Computes the cycles used by the scalar broadcast 2D codelet
poplar::VertexPerfEstimate broadcastArithmeticCycleEstimate(
    const poplar::Target &target, popops::expr::BinaryOpType op,
    const poplar::Type &inType, const poplar::Type &outType, bool inPlace,
    bool uniformScalar, const std::vector<std::size_t> &data);
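
// Example (illustrative sketch): the broadcast estimators return a
// poplar::VertexPerfEstimate rather than a raw cycle count. The helper name
// and data size are assumed placeholders.
inline poplar::VertexPerfEstimate
exampleBroadcastAddCycles(const poplar::Target &target) {
  // In-place scalar broadcast ADD over 512 half elements (1D codelet).
  return broadcastArithmetic1DCycleEstimate(target, expr::BinaryOpType::ADD,
                                            poplar::HALF, poplar::HALF,
                                            /* inPlace */ true,
                                            /* dataSize */ 512);
}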

} // namespace internal
} // namespace popops

#endif // _popops_performance_estimation_h_