Poplar and PopLibs
Gru.hpp File Reference

Support for gated recurrent units. More...

#include <poplar/Tensor.hpp>
#include <poplin/MatMul.hpp>
#include <popnn/GruDef.hpp>
#include <popnn/NonLinearityDef.hpp>
#include <popnn/Rnn.hpp>


Classes

struct  popnn::gru::GruParams
 Structure representing the parameters of the GRU. More...
 
struct  popnn::gru::GruWeights
 Structure holding all the parameters of a GRU cell, or the deltas for those parameters (depending on the context). More...
 

Namespaces

namespace  popnn
 Functions used in neural networks.
 

Functions

const std::vector< BasicGruCellUnit > popnn::gru::getDefaultBasicGruCellOrder ()
 Get the default order of the gates in a basic GRU cell. More...
 
poplar::Tensor popnn::gru::createInput (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create an input tensor of shape [numSteps, batchSize, inputSize] that is optimally mapped to multiply the whole input sequence in a single matrix multiply operation. More...
 
std::pair< poplar::Tensor, poplar::Tensor > popnn::gru::createWeightsKernel (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create the weights kernel used to weight the input and output of a GRU. More...
 
poplar::Tensor popnn::gru::createWeightsBiases (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create the weight biases.
 
GruWeights popnn::gru::createWeights (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create the weights (both kernel and biases) used to weight the input and output of a GRU.
 
poplar::Tensor popnn::gru::createAttention (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={})
 Create an attention tensor for AUGRU.
 
poplar::Tensor popnn::gru::gruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const GruWeights &weights, poplar::Tensor *intermediates, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying a GRU across a sequence. More...
 
poplar::Tensor popnn::gru::gruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const poplar::Tensor &realTimeSteps, const GruWeights &weights, poplar::Tensor *intermediates, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying a GRU across a sequence. More...
 
poplar::Tensor popnn::gru::auGruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const GruWeights &weights, poplar::Tensor *intermediates, const poplar::Tensor &attScores, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying an AUGRU across a sequence. More...
 
poplar::Tensor popnn::gru::auGruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const poplar::Tensor &realTimeSteps, const GruWeights &weights, poplar::Tensor *intermediates, const poplar::Tensor &attScores, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying an AUGRU across a sequence. More...
 
poplar::Tensor popnn::gru::gruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run GRU backward pass. More...
 
poplar::Tensor popnn::gru::gruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &realTimeSteps, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run GRU backward pass. More...
 
poplar::Tensor popnn::gru::auGruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run AUGRU backward pass. More...
 
poplar::Tensor popnn::gru::auGruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &realTimeSteps, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run AUGRU backward pass. More...
 
GruWeights popnn::gru::gruWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const poplar::Tensor &bwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a standalone weight update pass. More...
 
GruWeights popnn::gru::auGruWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const poplar::Tensor &bwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a standalone weight update pass. More...
 
poplar::Tensor popnn::gru::gruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined GRU backward and weight update pass. More...
 
poplar::Tensor popnn::gru::gruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &realTimeSteps, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined GRU backward and weight update pass. More...
 
poplar::Tensor popnn::gru::auGruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined AUGRU backward and weight update pass. More...
 
poplar::Tensor popnn::gru::auGruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &realTimeSteps, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined AUGRU backward and weight update pass. More...
 

Detailed Description

Support for gated recurrent units.

Function Documentation

◆ auGruBwd() [1/2]

poplar::Tensor popnn::gru::auGruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run AUGRU backward pass.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for an AUGRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
fwdInputSeq  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.
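
For illustration, here is a minimal sketch of wiring this backward pass into a program as a helper function. The helper name buildAuGruBwd is hypothetical, and every input is assumed to come from an earlier auGruFwd() call that retained fwdIntermediates:

    #include <popnn/Gru.hpp>

    // Hypothetical helper: adds the AUGRU backward pass to `prog`.
    poplar::Tensor buildAuGruBwd(poplar::Graph &graph,
                                 const popnn::gru::GruParams &params,
                                 poplar::program::Sequence &prog,
                                 const poplar::Tensor &stateInit,
                                 const poplar::Tensor &fwdIntermediates,
                                 const popnn::gru::GruWeights &weights,
                                 const poplar::Tensor &input,
                                 const poplar::Tensor &output,
                                 const poplar::Tensor &outputGrad,
                                 const poplar::Tensor &attScores,
                                 poplin::PlanningCache &cache) {
      poplar::Tensor inputGrad, bwdIntermediates, attentionsGrad;
      // Retain bwdIntermediates so a later auGruWU() call can compute the
      // weight deltas; pass nullptr instead if they are not needed.
      return popnn::gru::auGruBwd(graph, params, prog, stateInit,
                                  fwdIntermediates, weights, input, output,
                                  outputGrad, &inputGrad, &bwdIntermediates,
                                  attScores, &attentionsGrad, {"auGruBwd"},
                                  {}, &cache);
    }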

◆ auGruBwd() [2/2]

poplar::Tensor popnn::gru::auGruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run AUGRU backward pass.

Deprecated:
Use the previously defined auGruBwd() overload instead.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for an AUGRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
fwdInputSeq  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ auGruBwdWithWU() [1/2]

poplar::Tensor popnn::gru::auGruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined AUGRU backward and weight update pass.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling auGruBwd() and auGruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
input  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ auGruBwdWithWU() [2/2]

poplar::Tensor popnn::gru::auGruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined AUGRU backward and weight update pass.

Deprecated:
Use the previously defined auGruBwdWithWU() overload instead.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling auGruBwd() and auGruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
input  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ auGruFwd() [1/2]

poplar::Tensor popnn::gru::auGruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
const poplar::Tensor &  attScores,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying an AUGRU across a sequence.

The formulas for an AUGRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $u_t = (1 - a_t) \cdot u_t$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product
  • $a_t$ is a scalar

The AUGRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each AUGRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
stateInit  Initial state for the AUGRU.
in  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
weights  The AUGRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
attScores  Attention scores for each timestep.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the AUGRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].
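
For example, a hypothetical forward-pass sketch for AUGRU training; graph, params, stateInit, options and cache are assumed to have been created already (see createInput() for the available option flags):

    #include <popnn/Gru.hpp>

    poplar::program::Sequence fwdProg;
    auto input = popnn::gru::createInput(graph, params, {"in"}, options, &cache);
    auto weights = popnn::gru::createWeights(graph, params, {"w"}, options, &cache);
    auto attScores = popnn::gru::createAttention(graph, params, {"att"}, options);
    // Retained for the backward pass; pass nullptr for inference.
    poplar::Tensor fwdIntermediates;
    auto output = popnn::gru::auGruFwd(graph, params, stateInit, input, weights,
                                       &fwdIntermediates, attScores, fwdProg,
                                       {"auGruFwd"}, options, &cache);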

◆ auGruFwd() [2/2]

poplar::Tensor popnn::gru::auGruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const poplar::Tensor &  realTimeSteps,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
const poplar::Tensor &  attScores,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying an AUGRU across a sequence.

Deprecated:
Use the previously defined auGruFwd() overload instead.

The formulas for an AUGRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $u_t = (1 - a_t) \cdot u_t$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product
  • $a_t$ is a scalar

The AUGRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each AUGRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
stateInit  Initial state for the AUGRU.
in  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
weights  The AUGRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
attScores  Attention scores for each timestep.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the AUGRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].

◆ auGruWU()

GruWeights popnn::gru::auGruWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const poplar::Tensor &  bwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a standalone weight update pass.

Takes the intermediate results from the forward and backward passes and calculates and returns the weight deltas.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence to add operations to.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
bwdIntermediates  Intermediate results from the backward pass.
weights  The AUGRU weights structure.
input  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
A set of weight gradients to sum with weights.

◆ createInput()

poplar::Tensor popnn::gru::createInput ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Create an input tensor of shape [numSteps, batchSize, inputSize] that is optimally mapped to multiply the whole input sequence in a single matrix multiply operation.

GRU options

  • availableMemoryProportion Decimal between 0 and 1 (inclusive).

    See poplin::createWeights() for more information.

  • inferenceOnly (true, false) [=true]

    Sets convolution pass to INFERENCE_FWD if true; TRAINING_FWD otherwise. See the pass option in poplin::createWeights().

  • partialsType (half, float) [=float]

    See poplin::createWeights() for more information.

Parameters
graph  Graph object to add the tensor to.
params  The GRU parameters.
debugContext  Optional debug information.
options  Any implementation/debug options for the GRU.
planningCache  A poplin matrix multiply planning cache.
Returns
A tensor created in the graph of shape [timeSteps, batchSize, inputSize].
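
As a sketch of putting these options to use. The GruParams constructor shown, taking the data type, batch size, timestep count and {inputSize, outputSize}, is assumed from the wider popnn API rather than documented on this page:

    #include <poplar/Graph.hpp>
    #include <popnn/Gru.hpp>

    // Assumed constructor: (dataType, batchSize, timeSteps, layerSizes).
    popnn::gru::GruParams params(poplar::HALF, /*batchSize=*/16,
                                 /*timeSteps=*/50, /*layerSizes=*/{128, 256});

    poplar::OptionFlags options = {
        {"availableMemoryProportion", "0.2"}, // limit temporary matmul memory
        {"inferenceOnly", "true"},            // INFERENCE_FWD convolution pass
        {"partialsType", "half"},             // matmul partials in half precision
    };

    poplin::PlanningCache cache; // shared across all GRU matmuls
    auto input =
        popnn::gru::createInput(graph, params, {"gruInput"}, options, &cache);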

◆ createWeightsKernel()

std::pair< poplar::Tensor, poplar::Tensor > popnn::gru::createWeightsKernel ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Create the weights kernel used to weight the input and output of a GRU.

Returns the inputWeights and outputWeights.
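
A short usage sketch; the structured binding simply unpacks the returned pair:

    // inputWeights weight the layer input; outputWeights weight the
    // recurrent (output) state.
    auto [inputWeights, outputWeights] =
        popnn::gru::createWeightsKernel(graph, params, {"gruKernel"});

To create the kernel and the biases together in a single GruWeights structure, use createWeights().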

◆ getDefaultBasicGruCellOrder()

const std::vector< BasicGruCellUnit > popnn::gru::getDefaultBasicGruCellOrder ( )

Get the default order of the gates in a basic GRU cell.

The default order is: [Reset gate, Update gate, Candidate].
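
A small sanity-check sketch, assuming the BasicGruCellUnit enum and its BASIC_GRU_CELL_NUM_UNITS count come from popnn/GruDef.hpp:

    #include <cassert>
    #include <popnn/Gru.hpp>

    // One entry per cell unit: reset gate, update gate, candidate.
    const auto order = popnn::gru::getDefaultBasicGruCellOrder();
    assert(order.size() == BASIC_GRU_CELL_NUM_UNITS);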

◆ gruBwd() [1/2]

poplar::Tensor popnn::gru::gruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run GRU backward pass.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for a GRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The GRU weights structure.
fwdInputSeq  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.
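
A hypothetical training sketch that splits the backward pass from the weight update; outputGrad is assumed to be the gradient arriving from the layer above, and graph, params, stateInit, input, weights, prog, options and cache to exist already:

    // Forward pass, retaining intermediates for training.
    poplar::Tensor fwdIntermediates;
    auto output = popnn::gru::gruFwd(graph, params, stateInit, input, weights,
                                     &fwdIntermediates, prog, {"fwd"},
                                     options, &cache);

    // Backward pass: fills inputGrad and bwdIntermediates and returns the
    // gradient of the initial output.
    poplar::Tensor inputGrad, bwdIntermediates;
    auto initOutputGrad = popnn::gru::gruBwd(
        graph, params, prog, stateInit, fwdIntermediates, weights, input,
        output, outputGrad, &inputGrad, &bwdIntermediates, {"bwd"},
        options, &cache);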

◆ gruBwd() [2/2]

poplar::Tensor popnn::gru::gruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run GRU backward pass.

Deprecated:
Use the previously defined gruBwd() overload instead.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for a GRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The GRU weights structure.
fwdInputSeq  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ gruBwdWithWU() [1/2]

poplar::Tensor popnn::gru::gruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined GRU backward and weight update pass.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling gruBwd() and gruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The GRU weights structure.
input  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.
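
A sketch of the combined pass, under the same assumptions as the gruBwd() example above; weightsGrad is assumed to be populated by the call:

    poplar::Tensor inputGrad;
    popnn::gru::GruWeights weightsGrad; // assumed filled in by the call
    auto initOutputGrad = popnn::gru::gruBwdWithWU(
        graph, params, prog, stateInit, fwdIntermediates, weights, input,
        output, outputGrad, &inputGrad, weightsGrad, {"bwdWU"},
        options, &cache);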

◆ gruBwdWithWU() [2/2]

poplar::Tensor popnn::gru::gruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined GRU backward and weight update pass.

Deprecated:
Use the previously defined gruBwdWithWU() overload instead.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling gruBwd() and gruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The GRU weights structure.
input  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ gruFwd() [1/2]

poplar::Tensor popnn::gru::gruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying a GRU across a sequence.

The formulas for a GRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product

The GRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each GRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
stateInit  Initial state for the GRU.
in  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
weights  The GRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the GRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].
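
As a minimal inference sketch (creating the initial state as a plain zeroable variable here is illustrative only; batchSize and outputSize must match params):

    #include <popnn/Gru.hpp>
    #include <poputil/TileMapping.hpp>

    auto input = popnn::gru::createInput(graph, params, {"in"}, options, &cache);
    auto weights = popnn::gru::createWeights(graph, params, {"w"}, options, &cache);
    auto stateInit = graph.addVariable(poplar::HALF, {batchSize, outputSize},
                                       "stateInit");
    poputil::mapTensorLinearly(graph, stateInit);
    // Inference only: pass nullptr so no intermediates are retained.
    auto output = popnn::gru::gruFwd(graph, params, stateInit, input, weights,
                                     /*intermediates=*/nullptr, prog,
                                     {"gruFwd"}, options, &cache);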

◆ gruFwd() [2/2]

poplar::Tensor popnn::gru::gruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const poplar::Tensor &  realTimeSteps,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying a GRU across a sequence.

Deprecated:
Use the previously defined gruFwd() overload instead.

The formulas for a GRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product

The GRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each GRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
stateInit  Initial state for the GRU.
in  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
weights  The GRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the GRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].

◆ gruWU()

GruWeights popnn::gru::gruWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const poplar::Tensor &  bwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a standalone weight update pass.

Takes the intermediate results from the forward and backward passes and calculates and returns the weight deltas.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence to add operations to.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
bwdIntermediates  Intermediate results from the backward pass.
weights  The GRU weights structure.
input  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
A set of weight gradients to sum with weights.
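
Completing the split training flow from the gruBwd() example, a sketch of the standalone weight update followed by applying the deltas. The popops::scaledAddTo update is one common choice rather than anything mandated by this API, and GruWeights is assumed to expose inputWeights, outputWeights and biases members:

    #include <popops/ScaledAdd.hpp>

    // Compute weight deltas from the retained intermediates.
    popnn::gru::GruWeights weightGrads = popnn::gru::gruWU(
        graph, params, prog, stateInit, fwdIntermediates, bwdIntermediates,
        weights, input, output, {"wu"}, options, &cache);

    // Apply the deltas; the sign and scale depend on your optimiser.
    const float learningRate = 0.01f;
    popops::scaledAddTo(graph, weights.inputWeights, weightGrads.inputWeights,
                        -learningRate, prog, {"updateW"});
    popops::scaledAddTo(graph, weights.outputWeights, weightGrads.outputWeights,
                        -learningRate, prog, {"updateU"});
    popops::scaledAddTo(graph, weights.biases, weightGrads.biases,
                        -learningRate, prog, {"updateB"});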