Poplar and PopLibs
Gru.hpp File Reference

Support for gated recurrent units. More...

#include <poplar/Tensor.hpp>
#include <poplin/MatMul.hpp>
#include <popnn/GruDef.hpp>
#include <popnn/NonLinearityDef.hpp>
#include <popnn/Rnn.hpp>


Classes

struct  popnn::gru::GruParams
 Structure representing the parameters of the GRU. More...
 
struct  popnn::gru::GruWeights
 Structure holding all the parameters of a GRU cell, or the deltas for those parameters (depending on the context). More...
 

Namespaces

namespace  popnn
 Functions used in neural networks.
 

Functions

const std::vector< BasicGruCellUnit > popnn::gru::getDefaultBasicGruCellOrder ()
 Get the default order of the gates in a basic GRU cell. More...
 
poplar::Tensor popnn::gru::createInput (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create an input tensor of shape [numSteps, batchSize, inputSize] that is optimally mapped to multiply the whole input sequence in a single matrix multiply operation. More...
 
std::pair< poplar::Tensor, poplar::Tensor > popnn::gru::createWeightsKernel (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create the weights kernel used to weight the input and output of a GRU. More...
 
poplar::Tensor popnn::gru::createWeightsBiases (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create the weight biases.
 
GruWeights popnn::gru::createWeights (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Create the weights (both kernel and biases) used to weight the input and output of a GRU.
 
poplar::Tensor popnn::gru::createAttention (poplar::Graph &graph, const GruParams &params, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options={})
 Create an attention tensor for AUGRU.
 
poplar::Tensor popnn::gru::gruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const GruWeights &weights, poplar::Tensor *intermediates, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying a GRU across a sequence. More...
 
poplar::Tensor popnn::gru::gruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const poplar::Tensor &realTimeSteps, const GruWeights &weights, poplar::Tensor *intermediates, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying a GRU across a sequence. More...
 
poplar::Tensor popnn::gru::auGruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const GruWeights &weights, poplar::Tensor *intermediates, const poplar::Tensor &attScores, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying an AUGRU across a sequence. More...
 
poplar::Tensor popnn::gru::auGruFwd (poplar::Graph &graph, const GruParams &params, const poplar::Tensor &stateInit, const poplar::Tensor &in, const poplar::Tensor &realTimeSteps, const GruWeights &weights, poplar::Tensor *intermediates, const poplar::Tensor &attScores, poplar::program::Sequence &fwdProg, const poplar::DebugContext &debugContext={}, const poplar::OptionFlags &options={}, poplin::PlanningCache *planningCache=nullptr)
 Calculate the result of applying an AUGRU across a sequence. More...
 
poplar::Tensor popnn::gru::gruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run GRU backward pass. More...
 
poplar::Tensor popnn::gru::gruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &realTimeSteps, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run GRU backward pass. More...
 
poplar::Tensor popnn::gru::auGruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run AUGRU backward pass. More...
 
poplar::Tensor popnn::gru::auGruBwd (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediatesSeq, const GruWeights &weights, const poplar::Tensor &fwdInputSeq, const poplar::Tensor &realTimeSteps, const poplar::Tensor &fwdOutput, const poplar::Tensor &gradLayerNext, poplar::Tensor *inputGrad, poplar::Tensor *bwdIntermediates, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run AUGRU backward pass. More...
 
GruWeights popnn::gru::gruWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const poplar::Tensor &bwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a standalone weight update pass. More...
 
GruWeights popnn::gru::auGruWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const poplar::Tensor &bwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a standalone weight update pass. More...
 
poplar::Tensor popnn::gru::gruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined GRU backward and weight update pass. More...
 
poplar::Tensor popnn::gru::gruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &realTimeSteps, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined GRU backward and weight update pass. More...
 
poplar::Tensor popnn::gru::auGruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined AUGRU backward and weight update pass. More...
 
poplar::Tensor popnn::gru::auGruBwdWithWU (poplar::Graph &graph, const GruParams &params, poplar::program::Sequence &prog, const poplar::Tensor &fwdOutputInit, const poplar::Tensor &fwdIntermediates, const GruWeights &weights, const poplar::Tensor &input, const poplar::Tensor &realTimeSteps, const poplar::Tensor &output, const poplar::Tensor &outputGrad, poplar::Tensor *inputGrad, GruWeights &weightsGrad, const poplar::Tensor &attentions, poplar::Tensor *attentionsGrad, const poplar::DebugContext &debugContext, const poplar::OptionFlags &options_, poplin::PlanningCache *planningCache)
 Run a combined AUGRU backward and weight update pass. More...
 

Detailed Description

Support for gated recurrent units.

Function Documentation

◆ auGruBwd() [1/2]

poplar::Tensor popnn::gru::auGruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run AUGRU backward pass.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for an AUGRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
fwdInputSeq  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.
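
For illustration, here is a minimal sketch of wiring this backward pass into a program as a helper function. The helper name buildAuGruBwd is hypothetical, and every input is assumed to come from an earlier auGruFwd() call that retained fwdIntermediates:

    #include <popnn/Gru.hpp>

    // Hypothetical helper: adds the AUGRU backward pass to `prog`.
    poplar::Tensor buildAuGruBwd(poplar::Graph &graph,
                                 const popnn::gru::GruParams &params,
                                 poplar::program::Sequence &prog,
                                 const poplar::Tensor &stateInit,
                                 const poplar::Tensor &fwdIntermediates,
                                 const popnn::gru::GruWeights &weights,
                                 const poplar::Tensor &input,
                                 const poplar::Tensor &output,
                                 const poplar::Tensor &outputGrad,
                                 const poplar::Tensor &attScores,
                                 poplin::PlanningCache &cache) {
      poplar::Tensor inputGrad, bwdIntermediates, attentionsGrad;
      // Retain bwdIntermediates so a later auGruWU() call can compute the
      // weight deltas; pass nullptr instead if they are not needed.
      return popnn::gru::auGruBwd(graph, params, prog, stateInit,
                                  fwdIntermediates, weights, input, output,
                                  outputGrad, &inputGrad, &bwdIntermediates,
                                  attScores, &attentionsGrad, {"auGruBwd"},
                                  {}, &cache);
    }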

◆ auGruBwd() [2/2]

poplar::Tensor popnn::gru::auGruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run AUGRU backward pass.

Deprecated:
Use the previously defined auGruBwd() overload instead.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for an AUGRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
fwdInputSeq  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ auGruBwdWithWU() [1/2]

poplar::Tensor popnn::gru::auGruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined AUGRU backward and weight update pass.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling auGruBwd() and auGruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
input  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ auGruBwdWithWU() [2/2]

poplar::Tensor popnn::gru::auGruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::Tensor &  attentions,
poplar::Tensor *  attentionsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined AUGRU backward and weight update pass.

Deprecated:
Use the previously defined auGruBwdWithWU() overload instead.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling auGruBwd() and auGruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The AUGRU weights structure.
input  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
attentions  Attention scores for each timestep.
[out] attentionsGrad  Gradients of the attention scores.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ auGruFwd() [1/2]

poplar::Tensor popnn::gru::auGruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
const poplar::Tensor &  attScores,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying an AUGRU across a sequence.

The formulas for an AUGRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $u_t = (1 - a_t) \cdot u_t$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product
  • $a_t$ is a scalar

The AUGRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each AUGRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
stateInit  Initial state for the AUGRU.
in  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
weights  The AUGRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
attScores  Attention scores for each timestep.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the AUGRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].
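
For example, a hypothetical forward-pass sketch for AUGRU training; graph, params, stateInit, options and cache are assumed to have been created already (see createInput() for the available option flags):

    #include <popnn/Gru.hpp>

    poplar::program::Sequence fwdProg;
    auto input = popnn::gru::createInput(graph, params, {"in"}, options, &cache);
    auto weights = popnn::gru::createWeights(graph, params, {"w"}, options, &cache);
    auto attScores = popnn::gru::createAttention(graph, params, {"att"}, options);
    // Retained for the backward pass; pass nullptr for inference.
    poplar::Tensor fwdIntermediates;
    auto output = popnn::gru::auGruFwd(graph, params, stateInit, input, weights,
                                       &fwdIntermediates, attScores, fwdProg,
                                       {"auGruFwd"}, options, &cache);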

◆ auGruFwd() [2/2]

poplar::Tensor popnn::gru::auGruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const poplar::Tensor &  realTimeSteps,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
const poplar::Tensor &  attScores,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying an AUGRU across a sequence.

Deprecated:
Use the previously defined auGruFwd() overload instead.

The formulas for an AUGRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $u_t = (1 - a_t) \cdot u_t$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product
  • $a_t$ is a scalar

The AUGRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each AUGRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
stateInit  Initial state for the AUGRU.
in  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
weights  The AUGRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
attScores  Attention scores for each timestep.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the AUGRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].

◆ auGruWU()

GruWeights popnn::gru::auGruWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const poplar::Tensor &  bwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a standalone weight update pass.

Takes the intermediate results from the forward and backward passes and calculates and returns the weight deltas.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the AUGRU cell belongs.
params  The parameters of the AUGRU.
prog  Program sequence to add operations to.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
bwdIntermediates  Intermediate results from the backward pass.
weights  The AUGRU weights structure.
input  The input tensor to the AUGRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
A set of weight gradients to sum with weights.

◆ createInput()

poplar::Tensor popnn::gru::createInput ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Create an input tensor of shape [numSteps, batchSize, inputSize] that is optimally mapped to multiply the whole input sequence in a single matrix multiply operation.

GRU options

  • availableMemoryProportion Decimal between 0 and 1 (inclusive).

    See poplin::createWeights() for more information.

  • inferenceOnly (true, false) [=true]

    Sets convolution pass to INFERENCE_FWD if true; TRAINING_FWD otherwise. See the pass option in poplin::createWeights().

  • partialsType (half, float) [=float]

    See poplin::createWeights() for more information.

Parameters
graph  Graph object to add the tensor to.
params  The GRU parameters.
debugContext  Optional debug information.
options  Any implementation/debug options for the GRU.
planningCache  A poplin matrix multiply planning cache.
Returns
A tensor created in the graph of shape [timeSteps, batchSize, inputSize].
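
As a sketch of putting these options to use. The GruParams constructor shown, taking the data type, batch size, timestep count and {inputSize, outputSize}, is assumed from the wider popnn API rather than documented on this page:

    #include <poplar/Graph.hpp>
    #include <popnn/Gru.hpp>

    // Assumed constructor: (dataType, batchSize, timeSteps, layerSizes).
    popnn::gru::GruParams params(poplar::HALF, /*batchSize=*/16,
                                 /*timeSteps=*/50, /*layerSizes=*/{128, 256});

    poplar::OptionFlags options = {
        {"availableMemoryProportion", "0.2"}, // limit temporary matmul memory
        {"inferenceOnly", "true"},            // INFERENCE_FWD convolution pass
        {"partialsType", "half"},             // matmul partials in half precision
    };

    poplin::PlanningCache cache; // shared across all GRU matmuls
    auto input =
        popnn::gru::createInput(graph, params, {"gruInput"}, options, &cache);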

◆ createWeightsKernel()

std::pair< poplar::Tensor, poplar::Tensor > popnn::gru::createWeightsKernel ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Create the weights kernel used to weight the input and output of a GRU.

Returns the inputWeights and outputWeights.
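
A short usage sketch; the structured binding simply unpacks the returned pair:

    // inputWeights weight the layer input; outputWeights weight the
    // recurrent (output) state.
    auto [inputWeights, outputWeights] =
        popnn::gru::createWeightsKernel(graph, params, {"gruKernel"});

To create the kernel and the biases together in a single GruWeights structure, use createWeights().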

◆ getDefaultBasicGruCellOrder()

const std::vector< BasicGruCellUnit > popnn::gru::getDefaultBasicGruCellOrder ( )

Get the default order of the gates in a basic GRU cell.

The default order is: [Reset gate, Update gate, Candidate].
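
A small sanity-check sketch, assuming the BasicGruCellUnit enum and its BASIC_GRU_CELL_NUM_UNITS count come from popnn/GruDef.hpp:

    #include <cassert>
    #include <popnn/Gru.hpp>

    // One entry per cell unit: reset gate, update gate, candidate.
    const auto order = popnn::gru::getDefaultBasicGruCellOrder();
    assert(order.size() == BASIC_GRU_CELL_NUM_UNITS);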

◆ gruBwd() [1/2]

poplar::Tensor popnn::gru::gruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run GRU backward pass.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for a GRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The GRU weights structure.
fwdInputSeq  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.
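
A hypothetical training sketch that splits the backward pass from the weight update; outputGrad is assumed to be the gradient arriving from the layer above, and graph, params, stateInit, input, weights, prog, options and cache to exist already:

    // Forward pass, retaining intermediates for training.
    poplar::Tensor fwdIntermediates;
    auto output = popnn::gru::gruFwd(graph, params, stateInit, input, weights,
                                     &fwdIntermediates, prog, {"fwd"},
                                     options, &cache);

    // Backward pass: fills inputGrad and bwdIntermediates and returns the
    // gradient of the initial output.
    poplar::Tensor inputGrad, bwdIntermediates;
    auto initOutputGrad = popnn::gru::gruBwd(
        graph, params, prog, stateInit, fwdIntermediates, weights, input,
        output, outputGrad, &inputGrad, &bwdIntermediates, {"bwd"},
        options, &cache);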

◆ gruBwd() [2/2]

poplar::Tensor popnn::gru::gruBwd ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediatesSeq,
const GruWeights &  weights,
const poplar::Tensor &  fwdInputSeq,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  fwdOutput,
const poplar::Tensor &  gradLayerNext,
poplar::Tensor *  inputGrad,
poplar::Tensor *  bwdIntermediates,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run GRU backward pass.

Deprecated:
Use the previously defined gruBwd() overload instead.

The backward pass executes in reverse order compared to the forward pass. If the forward steps for a GRU layer are sf = {0, 1, 2, ..., S - 1}, then the backward steps run as sb = {S - 1, S - 2, ..., 1, 0}.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediatesSeq  Intermediate results from the forward pass.
weights  The GRU weights structure.
fwdInputSeq  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
fwdOutput  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
gradLayerNext  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
[out] bwdIntermediates  Intermediate gradients that are retained in the backward pass of training for use in the weight update. This includes the derivatives for the reset gate, update gate, and candidate. Set this argument to null if you do not need to calculate weight deltas.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ gruBwdWithWU() [1/2]

poplar::Tensor popnn::gru::gruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined GRU backward and weight update pass.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling gruBwd() and gruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The GRU weights structure.
input  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.
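
A sketch of the combined pass, under the same assumptions as the gruBwd() example above; weightsGrad is assumed to be populated by the call:

    poplar::Tensor inputGrad;
    popnn::gru::GruWeights weightsGrad; // assumed filled in by the call
    auto initOutputGrad = popnn::gru::gruBwdWithWU(
        graph, params, prog, stateInit, fwdIntermediates, weights, input,
        output, outputGrad, &inputGrad, weightsGrad, {"bwdWU"},
        options, &cache);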

◆ gruBwdWithWU() [2/2]

poplar::Tensor popnn::gru::gruBwdWithWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  realTimeSteps,
const poplar::Tensor &  output,
const poplar::Tensor &  outputGrad,
poplar::Tensor *  inputGrad,
GruWeights &  weightsGrad,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a combined GRU backward and weight update pass.

Deprecated:
Use the previously defined gruBwdWithWU() overload instead.

If you do not need to split the operation, use this combined backward and weight update pass in preference to calling gruBwd() and gruWU() separately: it allows the most efficient implementation to be chosen.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
weights  The GRU weights structure.
input  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
outputGrad  The gradients of the output. Depending on the outputFullSequence parameter, this is either the gradient of the output for the last timestep or a sequence of output gradients for each timestep.
[out] inputGrad  The gradients of the inputs; may be null if this information is not required.
weightsGrad  A set of weight deltas to sum with the weights.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The gradient of the initial output.

◆ gruFwd() [1/2]

poplar::Tensor popnn::gru::gruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying a GRU across a sequence.

The formulas for a GRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product

The GRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each GRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
stateInit  Initial state for the GRU.
in  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
weights  The GRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the GRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].
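
As a minimal inference sketch (creating the initial state as a plain zeroable variable here is illustrative only; batchSize and outputSize must match params):

    #include <popnn/Gru.hpp>
    #include <poputil/TileMapping.hpp>

    auto input = popnn::gru::createInput(graph, params, {"in"}, options, &cache);
    auto weights = popnn::gru::createWeights(graph, params, {"w"}, options, &cache);
    auto stateInit = graph.addVariable(poplar::HALF, {batchSize, outputSize},
                                       "stateInit");
    poputil::mapTensorLinearly(graph, stateInit);
    // Inference only: pass nullptr so no intermediates are retained.
    auto output = popnn::gru::gruFwd(graph, params, stateInit, input, weights,
                                     /*intermediates=*/nullptr, prog,
                                     {"gruFwd"}, options, &cache);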

◆ gruFwd() [2/2]

poplar::Tensor popnn::gru::gruFwd ( poplar::Graph &  graph,
const GruParams &  params,
const poplar::Tensor &  stateInit,
const poplar::Tensor &  in,
const poplar::Tensor &  realTimeSteps,
const GruWeights &  weights,
poplar::Tensor *  intermediates,
poplar::program::Sequence &  fwdProg,
const poplar::DebugContext &  debugContext = {},
const poplar::OptionFlags &  options = {},
poplin::PlanningCache *  planningCache = nullptr 
)

Calculate the result of applying a GRU across a sequence.

Deprecated:
Use the previously defined gruFwd() overload instead.

The formulas for a GRU cell are:

  • $r_t = \operatorname{sigmoid}(w_r \times x_t + u_r \times h_{t-1} + b_r)$
  • $u_t = \operatorname{sigmoid}(w_u \times x_t + u_u \times h_{t-1} + b_u)$
  • $c_t = \tanh(w_c \times x_t + u_c \times (r_t \circ h_{t-1}) + b_c)$
  • $h_t = u_t \circ h_{t-1} + (1 - u_t) \circ c_t$

Where:

  • $\times$ is matrix multiplication
  • $\circ$ is Hadamard product

The GRU is run for rnn::RnnParams.maxTimeSteps steps, each step processing a batch of size batchSize with input size inputSize and output size outputSize. The total number of units within each GRU cell is BASIC_GRU_CELL_NUM_UNITS.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
stateInit  Initial state for the GRU.
in  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
realTimeSteps  A tensor containing the real timesteps for each sequence, of shape [batchSize].
weights  The GRU weights structure.
[out] intermediates  Intermediate results that are retained in the forward pass of training for use in the backward pass. This includes the data for the reset gate, update gate, candidate, and the output if outputFullSequence is false. Set this argument to null for inference-only use.
fwdProg  Program sequence.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
The output of the GRU. Depending on the outputFullSequence parameter, the output tensor is either the output of the last timestep, of shape [batchSize, outputSize], or the sequence of outputs for every timestep, of shape [timeSteps, batchSize, outputSize].

◆ gruWU()

GruWeights popnn::gru::gruWU ( poplar::Graph &  graph,
const GruParams &  params,
poplar::program::Sequence &  prog,
const poplar::Tensor &  fwdOutputInit,
const poplar::Tensor &  fwdIntermediates,
const poplar::Tensor &  bwdIntermediates,
const GruWeights &  weights,
const poplar::Tensor &  input,
const poplar::Tensor &  output,
const poplar::DebugContext &  debugContext,
const poplar::OptionFlags &  options_,
poplin::PlanningCache *  planningCache 
)

Run a standalone weight update pass.

Takes the intermediate results from the forward and backward passes and calculates and returns the weight deltas.

Note: If the timestep limit is variable, the entries above the given timestep limit must be explicitly set to zero in fwdIntermediates so that the weights are updated correctly.

Parameters
graph  Graph to which the GRU cell belongs.
params  The parameters of the GRU.
prog  Program sequence to add operations to.
fwdOutputInit  Forward output tensor for the initial step.
fwdIntermediates  Intermediate results from the forward pass.
bwdIntermediates  Intermediate results from the backward pass.
weights  The GRU weights structure.
input  The input tensor to the GRU, of shape [timeSteps, batchSize, inputSize].
output  The output tensor from the forward pass. Depending on the outputFullSequence parameter, this is either the output for the last timestep or a sequence of outputs for each timestep.
debugContext  Optional debug information.
options  GRU implementation options. See createInput().
planningCache  The matmul planning cache.
Returns
A set of weight gradients to sum with weights.
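
Completing the split training flow from the gruBwd() example, a sketch of the standalone weight update followed by applying the deltas. The popops::scaledAddTo update is one common choice rather than anything mandated by this API, and GruWeights is assumed to expose inputWeights, outputWeights and biases members:

    #include <popops/ScaledAdd.hpp>

    // Compute weight deltas from the retained intermediates.
    popnn::gru::GruWeights weightGrads = popnn::gru::gruWU(
        graph, params, prog, stateInit, fwdIntermediates, bwdIntermediates,
        weights, input, output, {"wu"}, options, &cache);

    // Apply the deltas; the sign and scale depend on your optimiser.
    const float learningRate = 0.01f;
    popops::scaledAddTo(graph, weights.inputWeights, weightGrads.inputWeights,
                        -learningRate, prog, {"updateW"});
    popops::scaledAddTo(graph, weights.outputWeights, weightGrads.outputWeights,
                        -learningRate, prog, {"updateU"});
    popops::scaledAddTo(graph, weights.biases, weightGrads.biases,
                        -learningRate, prog, {"updateB"});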