import tensorflow as tf
import os
import numpy as np
import time
import logging
# Import IPU API
from tensorflow.python import ipu
from tensorflow.python.keras.backend import unique_object_name
from typing import Any, Optional, Tuple, Union
def create_input_data(batch_size=1, height=224, width=224, channels=4):
"""
Create the input dataset for in-feeds
:param batch_size: size of the batches to process
:param height: height of input image
:param width: width of input image
:param channels: channels (RGB) in the input image
:return: Constructed dataset
"""
# Synthetic input data follows NHWC format
input_data = np.random.random((batch_size, height, width, channels))
input_data = tf.cast(input_data, DTYPE)
# Prepare dataset for ipu_infeeds
ds = tf.data.Dataset \
.range(1) \
.map(lambda k: {"features": input_data}) \
.repeat() \
.prefetch(BATCHES_PER_STEP)
return ds
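# A hedged sketch of how the dataset above is consumed: each element is a dict,
# so the on-device loop body receives it as the keyword argument "features"
# (matching the key used in the map() call). Names mirror the code in this file.
#
#   infeed = ipu.ipu_infeed_queue.IPUInfeedQueue(create_input_data())
#
#   def body(features):        # "features" matches the dict key above
#       ...                    # build the network on `features`
#
#   ipu.loops.repeat(n=BATCHES_PER_STEP, body=body, infeed_queue=infeed)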
def conv(input_tensor: tf.Tensor,
kernel_size: Union[int, Tuple[int, int]],
filters_out: int,
         stride: int = 1,
         padding: str = 'SAME',
         add_bias: bool = True,
         dtype: Any = tf.float16,
         name: Optional[str] = None,
         weight_suffix: str = "kernel",
         bias_suffix: str = "conv/bias",
*_):
"""
Apply convolutional layer and optional bias on input tensor.
Args:
input_tensor: Input data
kernel_size: Filter size (assumes equal height and width)
filters_out: Number of output filters
stride: Stride of the filter
padding: Type of padding to use
add_bias: Should bias be added
dtype: Data type of parameters
name: Optional name for this op
weight_suffix: String to weight name with
bias_suffix: String to suffix the bias name with
Returns: Output of convolution operator.
"""
# Assumes input in NHWC format.
filters_in = input_tensor.get_shape()[-1]
if isinstance(kernel_size, int):
w_shape = [kernel_size, kernel_size, filters_in, filters_out]
else:
        w_shape = list(kernel_size) + [filters_in, filters_out]
w_init = tf.contrib.layers.xavier_initializer(dtype=dtype)
if name is None:
name = unique_object_name("conv2d", zero_based=True)
name_scope = tf.get_default_graph().get_name_scope()
if name_scope not in ["", None]:
name = name_scope + "/" + name
with tf.get_default_graph().as_default():
with tf.variable_scope(name):
weights = tf.get_variable(weight_suffix,
shape=w_shape,
initializer=w_init,
dtype=dtype)
output_tensor = tf.nn.conv2d(input_tensor,
weights, [1, stride, stride, 1],
padding=padding.upper(),
name=name)
if add_bias:
b_shape = [filters_out]
b_init = tf.zeros_initializer()
with tf.variable_scope(name):
biases = tf.get_variable(bias_suffix,
shape=b_shape,
initializer=b_init,
dtype=dtype)
output_tensor += biases
return output_tensor
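# Illustrative usage of conv() (a sketch, not executed): a 3x3 convolution with
# 64 output channels on an NHWC float16 tensor `x`; variables are created as
# "example_conv/kernel" and "example_conv/conv/bias" given the default suffixes.
#
#   y = conv(x, kernel_size=3, filters_out=64, stride=1, name="example_conv")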
# Block definitions for ResNeXt
def input_block(x):
x = conv(x, kernel_size=7, stride=2, filters_out=64, name="conv1")
x = norm(x, training=False)
x = relu(x)
x = maxpool(x, size=3, stride=2)
return x
def maxpool(x, size=3, stride=2):
x = tf.nn.max_pool(x,
ksize=[1, size, size, 1],
strides=[1, stride, stride, 1],
padding='SAME')
return x
def reduce_mean(x, indices=(1, 2)):
    x = tf.reduce_mean(x, axis=indices)
return x
def fc(x, num_units_out):
num_units_in = x.get_shape()[1]
w_init = tf.contrib.layers.xavier_initializer(dtype=tf.float16)
b_init = tf.constant_initializer(0.0)
weights = tf.get_variable('weights',
shape=[num_units_in, num_units_out],
initializer=w_init,
dtype=tf.float16)
biases = tf.get_variable('biases',
shape=[num_units_out],
initializer=b_init,
dtype=tf.float16)
x = tf.nn.xw_plus_b(x, weights, biases)
return x
def norm(x, training=False):
x = tf.layers.batch_normalization(x,
fused=True,
center=True,
scale=True,
training=training,
trainable=training,
momentum=0.997,
epsilon=1e-5)
return x
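# Note: with training=False the batch norm layer normalises using its moving
# mean/variance (left at their initial values here, since this benchmark runs
# inference on randomly initialised weights), and trainable=False keeps the
# scale/offset parameters out of the trainable-variable collection.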
def relu(x):
return tf.nn.relu(x)
def group_conv(x,
ksize,
stride,
filters_in,
filters_out,
index=0,
groups=1,
dtype=tf.float16,
name='conv'):
"""
Apply group convolutions by leveraging XLA implementation.
"""
with tf.variable_scope(name, use_resource=True):
W = tf.get_variable(
"conv2d/kernel" + str(index),
            shape=[ksize, ksize, filters_in.value // groups, filters_out],
dtype=dtype,
trainable=True,
initializer=tf.variance_scaling_initializer())
return tf.nn.conv2d(x,
filters=W,
strides=[1, stride, stride, 1],
padding='SAME')
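# Shape sketch for the grouped convolution above (32x4d configuration used in
# this script): with filters_in=128, groups=32 and filters_out=128, the kernel
# has shape [3, 3, 4, 128]. The mismatch between the input's 128 channels and
# the kernel depth of 4 is lowered by the XLA/Poplar backend as 32 independent
# groups of 4 channels each.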
def group_conv_block(x, first_stride, filters, count, name='', cardinality=4):
"""
Group convolution block implementation.
:param x: Input tensor
:param first_stride: Initial tensor
:param filters: List of number of filters for various convolution blocks
:param count: Number of times block is repeated
:param name: Name of block
:param cardinality: Number of groups = outputchannels/cardinality
:return: Layer x after application of all the ops within the block
"""
for i in range(count):
shortcut = x
stride = (first_stride if (i == 0) else 1)
# First vanilla convolution
x = conv(x,
kernel_size=1,
stride=stride,
filters_out=filters[0],
add_bias=False,
name=name + str(i) + "_1",
dtype=tf.float16)
x = norm(x)
x = relu(x)
# Group convolution evaluation
x = group_conv(x,
ksize=3,
stride=1,
filters_in=x.get_shape()[-1],
filters_out=filters[0],
index=1,
name=name + str(i) + "_2",
groups=cardinality,
dtype=tf.float16)
x = norm(x)
x = relu(x)
# Second vanilla convolution
x = conv(x,
kernel_size=1,
stride=1,
filters_out=filters[1],
add_bias=False,
name=name + str(i) + "_3",
dtype=tf.float16)
x = norm(x)
if i == 0:
shortcut = conv(shortcut,
kernel_size=1,
stride=stride,
filters_out=filters[1],
add_bias=False,
name=name + str(i) + "skip",
dtype=tf.float16)
shortcut = norm(shortcut)
x = shortcut + x
x = relu(x)
return x
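# Each repetition above is the usual ResNeXt bottleneck, roughly:
#
#   out = relu(norm(conv1x1(x)))           # reduce to filters[0]
#   out = relu(norm(group_conv3x3(out)))   # `cardinality` groups
#   out = norm(conv1x1(out))               # expand to filters[1]
#   x   = relu(out + shortcut)             # projection shortcut on the first repeat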
def resnext101_model():
"""
Define ResNext-101 network graph
"""
def body(features):
with tf.variable_scope("VanillaResNeXt"):
x = input_block(features)
x = group_conv_block(x,
first_stride=1,
filters=[128, 256],
count=3,
cardinality=CARDINALITY,
name='res2_') # 112
x = group_conv_block(x,
first_stride=2,
filters=[256, 512],
count=4,
cardinality=CARDINALITY,
name='res3_') # 224
x = group_conv_block(x,
first_stride=2,
filters=[512, 1024],
count=23,
cardinality=CARDINALITY,
name='res4_') # 448
x = group_conv_block(x,
first_stride=2,
filters=[1024, 2048],
count=3,
cardinality=CARDINALITY,
name='res5_') # 896
x = reduce_mean(x)
output = fc(x, num_units_out=1000)
outfeed = outfeed_queue.enqueue(output)
return outfeed
    return ipu.loops.repeat(n=BATCHES_PER_STEP,
                            body=body,
                            infeed_queue=infeed_queue)
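# Wrapping the network in ipu.loops.repeat means a single session.run() of the
# compiled graph processes BATCHES_PER_STEP batches on the IPU, streaming inputs
# through the infeed queue and results through the outfeed queue instead of a
# host round trip per batch. The host-side pattern used below is, in outline:
#
#   res = ipu.ipu_compiler.compile(resnext101_model, inputs=[])
#   sess.run(infeed_queue.initializer)
#   sess.run(res)                      # runs BATCHES_PER_STEP batches on device
#   results = sess.run(outfeed_queue.dequeue())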
if __name__ == '__main__':
print("ResNeXt-101 Inference")
IPU_MODEL = False
# Number of steps
NUM_ITERATIONS = 5
BATCHES_PER_STEP = 1000
# Model
MODEL = 'ResNeXt-101'
CARDINALITY = 32
BATCH_SIZE = 4
# Precision
DTYPE = tf.float16
# Create input data using randomized numpy arrays
dataset = create_input_data(batch_size=BATCH_SIZE,
height=224,
width=224,
channels=4)
if IPU_MODEL:
os.environ['TF_POPLAR_FLAGS'] = "--use_ipu_model"
# Setup infeed queue
if BATCHES_PER_STEP > 1:
with tf.device('cpu'):
infeed_queue = ipu.ipu_infeed_queue.IPUInfeedQueue(
dataset)
else:
raise NotImplementedError("batches per step == 1 not implemented yet.")
# Setup outfeed
outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()
# Compiles graph and targets IPU(s)
with ipu.scopes.ipu_scope('/device:IPU:0'):
res = ipu.ipu_compiler.compile(resnext101_model, inputs=[])
# Setup IPU configuration and build session
cfg = ipu.config.IPUConfig()
cfg.convolutions.poplar_options["availableMemoryProportion"] = "0.3"
cfg.ipu_model.compile_ipu_code = False
cfg.auto_select_ipus = 1
cfg.configure_ipu_system()
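    # Configuration notes: "availableMemoryProportion" caps the fraction of each
    # tile's memory that convolution planning may use for temporaries (the
    # default is 0.6; 0.3 trades some speed for memory headroom),
    # compile_ipu_code=False skips compiling real-IPU code when the IPU Model
    # emulator is used, and auto_select_ipus=1 attaches to any single available IPU.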
ipu.utils.move_variable_initialization_to_cpu()
outfeed = outfeed_queue.dequeue()
with tf.Session() as sess:
fps = []
latency = []
sess.run(infeed_queue.initializer)
sess.run(tf.global_variables_initializer())
# Warm up
print("Compiling and Warmup...")
start = time.time()
sess.run(res)
        sess.run(outfeed)  # drain the outfeed from the warm-up run
duration = time.time() - start
print("Duration: {:.3f} seconds\n".format(duration))
for iter_count in range(NUM_ITERATIONS):
print("Running iteration: ", iter_count)
# Run
start = time.time()
sess.run(res)
sess.run(outfeed)
stop = time.time()
fps.append((BATCHES_PER_STEP * BATCH_SIZE) / (stop - start))
            logging.info(
                "Iter {3}: {0} Throughput using synthetic data = {1:.1f}"
                " imgs/sec at batch size = {2}".format(
                    str(MODEL), fps[-1], BATCH_SIZE, iter_count))
            latency.append(1000 * (stop - start) / BATCHES_PER_STEP)
            logging.info(
                "Iter {3}: {0} Latency using synthetic data = {2:.2f} msecs "
                "at batch_size = {1}".format(str(MODEL), BATCH_SIZE,
                                             latency[-1], iter_count))
print("Average statistics over {0} iterations, excluding the 1st "
"iteration.".format(NUM_ITERATIONS))
print("-------------------------")
fps = fps[1:]
latency = latency[1:]
        print(
            "Throughput at bs={} of {}: min={:.1f}, max={:.1f}, mean={:.1f}, "
            "std={:.1f}.".format(BATCH_SIZE, str(MODEL), min(fps), max(fps),
                                 np.mean(fps), np.std(fps)))
        print("Latency at bs={} of {}: min={:.2f}, max={:.2f}, mean={:.2f}, "
              "std={:.2f}.".format(BATCH_SIZE, str(MODEL), min(latency),
                                   max(latency), np.mean(latency),
                                   np.std(latency)))