1. Introduction

Model Runtime is a library built on top of Poplar Runtime that makes it easy to load and run models stored in the Poplar Exchange Format (PopEF) on the IPU. It consists of various tools for deploying models, intended for a wide range of inference applications, for example inference serving backends. The Model Runtime library is logically divided into two parts:

  • The high-level API allows quick model deployment and requires minimal knowledge of the Poplar SDK libraries and the IPU hardware.

  • The low-level API is aimed at advanced users and allows more flexibility than the high-level API. It requires knowledge of PopEF, Poplar Runtime and the IPU hardware.

The Model Runtime library supports C++ and Python.

This chapter introduces the fundamental concepts of the Model Runtime library through self-contained examples.

Note

Files included by the examples in this chapter can be found in Section 9, Appendix. They contain helper functions, for example for processing command-line arguments.
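
For orientation only, here is a minimal sketch of what such a command-line helper could look like, assuming boost::program_options. This is not the exact implementation from the appendix; the function name and option spelling simply mirror how the listings use it.

// Hypothetical sketch of examples::parsePopefProgramOptions; the real
// helper used by the listings is provided in the appendix.
#include <boost/program_options.hpp>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

namespace po = boost::program_options;

po::variables_map parsePopefProgramOptions(const char *example_desc, int argc,
                                           char *argv[]) {
  po::options_description desc(example_desc);
  desc.add_options()
      ("help,h", "Show this help message.")
      ("popef,p",
       po::value<std::vector<std::string>>()->required()->multitoken(),
       "A collection of PopEF files containing the model.");

  po::variables_map vm;
  po::store(po::parse_command_line(argc, argv, desc), vm);
  if (vm.count("help")) {
    std::cout << desc << std::endl;
    std::exit(EXIT_SUCCESS);
  }
  po::notify(vm); // Throws if the required --popef option is missing.
  return vm;
}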

This C++ high-level API example loads the model from PopEF files and sends one inference request to the IPU.

Listing 1.1 model_runner_quick_start.cpp
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <string>
#include <vector>

#include <boost/program_options.hpp>

#include "model_runtime/ModelRunner.hpp"
#include "model_runtime/Tensor.hpp"
#include "utils.hpp"

/* The example shows loading a model from PopEF files and sending a single
 * inference request.
 */
int main(int argc, char *argv[]) {
  using namespace std::chrono_literals;
  static const char *example_desc = "Model runner simple example.";
  const boost::program_options::variables_map vm =
      examples::parsePopefProgramOptions(example_desc, argc, argv);
  const auto popef_paths = vm["popef"].as<std::vector<std::string>>();

  model_runtime::ModelRunnerConfig config;
  config.device_wait_config =
      model_runtime::DeviceWaitConfig{600s /*timeout*/, 1s /*sleep_time*/};
  model_runtime::ModelRunner model_runner(popef_paths, config);

  examples::print("Allocating input tensors");

  const model_runtime::InputMemory input_memory =
      examples::allocateHostInputData(model_runner.getExecuteInputs());

  examples::printInputMemory(input_memory);

  examples::print("Sending single synchronous request with empty data.");

  const model_runtime::OutputMemory output_memory =
      model_runner.execute(examples::toInputMemoryView(input_memory));

  examples::print("Received output:");

  using OutputValueType =
      std::pair<const std::string, model_runtime::TensorMemory>;

  for (const OutputValueType &name_with_memory : output_memory) {
    auto &&[name, memory] = name_with_memory;
    examples::print(fmt::format("Output tensor {}, {} bytes", name,
                                memory.data_size_bytes));
  }

  examples::print("Success: exiting");
  return EXIT_SUCCESS;
}

Download model_runner_quick_start.cpp
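
Assuming the example has been compiled against the Poplar SDK and the Model Runtime library into a model_runner_quick_start binary, it can be run with one or more PopEF files passed to --popef (model.popef below is a placeholder path):

  ./model_runner_quick_start --popef model.popef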

This Python high-level API example loads the model from PopEF files and sends one inference request to the IPU.

Listing 1.2 model_runner_quick_start.py
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
"""
The example shows loading a model from PopEF files and sending a single
inference request.
"""

import argparse
from datetime import timedelta
import numpy as np
import model_runtime


def main():
    parser = argparse.ArgumentParser("Model runner simple example.")
    parser.add_argument(
        "-p",
        "--popef",
        type=str,
        metavar='popef_file_path',
        help="A collection of PopEF files containing the model.",
        nargs='+',
        required=True)
    args = parser.parse_args()

    # Create model runner
    config = model_runtime.ModelRunnerConfig()
    config.device_wait_config = model_runtime.DeviceWaitConfig(
        model_runtime.DeviceWaitStrategy.WAIT_WITH_TIMEOUT,
        timeout=timedelta(seconds=600),
        sleepTime=timedelta(seconds=1))

    print("Creating ModelRunner with", config)
    runner = model_runtime.ModelRunner(model_runtime.PopefPaths(args.popef),
                                       config=config)

    print("Preparing input tensors:")
    input_descriptions = runner.getExecuteInputs()
    input_tensors = [
        np.random.randn(*input_desc.shape).astype(input_desc.numpy_data_type())
        for input_desc in input_descriptions
    ]
    input_view = model_runtime.InputMemoryView()

    for input_desc, input_tensor in zip(input_descriptions, input_tensors):
        print("\tname:", input_desc.name, "shape:", input_tensor.shape,
              "dtype:", input_tensor.dtype)
        input_view[input_desc.name] = input_tensor

    print("Sending single synchronous request with random data.")
    result = runner.execute(input_view)
    output_descriptions = runner.getExecuteOutputs()

    print("Processing output tensors:")
    for output_desc in output_descriptions:
        output_tensor = np.frombuffer(
            result[output_desc.name],
            dtype=output_desc.numpy_data_type()).reshape(output_desc.shape)
        print("\tname:", output_desc.name, "shape:", output_tensor.shape,
              "dtype:", output_tensor.dtype, "\n", output_tensor)

    print("Success: exiting")
    return 0


if __name__ == "__main__":
    main()

Download model_runner_quick_start.py
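
The script takes the same --popef (or -p) argument, accepting one or more PopEF file paths (model.popef below is a placeholder):

  python3 model_runner_quick_start.py --popef model.popef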

This C++ low-level API example loads the model from PopEF files and sends one inference request to the IPU.

Listing 1.3 session_quick_start.cpp
// Copyright (c) 2022 Graphcore Ltd. All rights reserved.
#include <string>
#include <vector>

#include <boost/program_options.hpp>

#include <popef/Model.hpp>
#include <popef/Reader.hpp>
#include <popef/Types.hpp>

#include "model_runtime/DeviceManager.hpp"
#include "model_runtime/Session.hpp"

#include "utils.hpp"

namespace examples {

std::shared_ptr<popef::Model>
createPopefModel(const std::vector<std::string> &popef_paths);

} // namespace examples

/* The example shows loading a model from PopEF files and sending a single
 * inference request.
 */
int main(int argc, char *argv[]) {
  using namespace std::chrono_literals;
  static const char *example_desc = "Session quick start example.";
  const boost::program_options::variables_map vm =
      examples::parsePopefProgramOptions(example_desc, argc, argv);
  const auto popef_paths = vm["popef"].as<std::vector<std::string>>();

  std::shared_ptr<popef::Model> model = examples::createPopefModel(popef_paths);

  model_runtime::Session session{model};
  examples::print("Created Session.");

  model_runtime::DeviceManager dm;
  model_runtime::DeviceWaitConfig wait_config{600s /*timeout*/,
                                              1s /*sleep_time*/};

  std::shared_ptr<model_runtime::Device> device =
      dm.getDevice(model, wait_config);
  examples::print(fmt::format("Device acquired: {}", *device));

  session.bindToDevice(device);

  model_runtime::QueueManager *queue_manager = session.createQueueManager();
  examples::print("Created QueueManager.");

  using byte_t = unsigned char;
  using memory_t = std::vector<byte_t>;

  std::vector<memory_t> allocated_memory;

  for (const popef::Anchor *input_anchor : session.getUserInputAnchors()) {
    const std::string anchor_name = input_anchor->name();
    const std::size_t size_in_bytes = input_anchor->tensorInfo().sizeInBytes();

    examples::print(
        fmt::format("Allocating input memory {} bytes for input anchor `{}`",
                    size_in_bytes, anchor_name));
    memory_t &memory = allocated_memory.emplace_back(size_in_bytes);

    examples::print(
        fmt::format("Queue Manager - enqueue input anchor `{}`", anchor_name));
    queue_manager->inputQueue(anchor_name)
        .enqueue(memory.data(), size_in_bytes);
  }

  for (const popef::Anchor *output_anchor : session.getUserOutputAnchors()) {
    const std::string anchor_name = output_anchor->name();
    const std::size_t size_in_bytes = output_anchor->tensorInfo().sizeInBytes();

    examples::print(
        fmt::format("Allocating output memory {} bytes for output anchor `{}`",
                    size_in_bytes, anchor_name));
    memory_t &memory = allocated_memory.emplace_back(size_in_bytes);

    examples::print(
        fmt::format("Queue Manager - enqueue output anchor `{}`", anchor_name));
    queue_manager->outputQueue(anchor_name)
        .enqueue(memory.data(), size_in_bytes);
  }

  examples::print("Session started model execution.");
  session.runLoadPrograms();
  session.runMainPrograms();
  examples::print("Session finished model execution.");

  examples::print("Success: exiting");
  return EXIT_SUCCESS;
}

namespace examples {

std::shared_ptr<popef::Model>
createPopefModel(const std::vector<std::string> &popef_paths) {
  auto reader = std::make_shared<popef::Reader>();
  for (const auto &path : popef_paths)
    reader->parseFile(path);

  return popef::ModelBuilder(reader).createModel();
}

} // namespace examples

Download session_quick_start.cpp
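
As with the high-level C++ example, assuming the program has been compiled into a session_quick_start binary, it takes one or more PopEF files via --popef (model.popef is a placeholder):

  ./session_quick_start --popef model.popef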

This Python low-level API example loads the model from PopEF files and sends one inference request to the IPU.

Listing 1.4 session_quick_start.py
#!/usr/bin/env python3
# Copyright (c) 2022 Graphcore Ltd. All rights reserved.
"""
The example shows loading a model from PopEF files and sending a single
inference request.
"""

import argparse
from datetime import timedelta
import numpy as np
import model_runtime
import popef


def main():
    parser = argparse.ArgumentParser("Session quick start example.")
    parser.add_argument(
        "-p",
        "--popef",
        type=str,
        metavar='popef_file_path',
        help="A collection of PopEF files containing the model.",
        nargs='+',
        required=True)
    args = parser.parse_args()

    # Parse all PopEF files with a single reader.
    reader = popef.Reader()
    for model_file in args.popef:
        reader.parseFile(model_file)

    model = popef.ModelBuilder(reader).createModel()

    dm = model_runtime.DeviceManager()
    wait_config = model_runtime.DeviceWaitConfig(
        model_runtime.DeviceWaitStrategy.WAIT_WITH_TIMEOUT,
        timeout=timedelta(seconds=600),
        sleepTime=timedelta(seconds=1))
    device = dm.getDevice(model, wait_config=wait_config)
    print("Device acquired:", device)

    session = model_runtime.Session(model)
    print("Created Session.")

    session.bindToDevice(device)

    queue_manager = session.createQueueManager()
    print("Created QueueManager.")

    user_input_anchors = session.getUserInputAnchors()
    input_data = prepare_io_memory(user_input_anchors)

    for anchor, tensor in zip(user_input_anchors, input_data):
        anchor_name = anchor.name()
        print("Enqueue input anchor ", anchor_name)
        queue_manager.inputs[anchor_name].enqueue(tensor)

    user_output_anchors = session.getUserOutputAnchors()
    output_data = prepare_io_memory(user_output_anchors)

    for anchor, tensor in zip(user_output_anchors, output_data):
        anchor_name = anchor.name()
        print("Enqueue output anchor ", anchor_name)
        queue_manager.outputs[anchor_name].enqueue(tensor)

    print("Running load programs")
    session.runLoadPrograms()

    print("Running main programs")
    session.runMainPrograms()

    print("Execution finished. Results available.")

    print("Success: exiting")
    return 0


def prepare_io_memory(anchors):
    # Allocate a random numpy buffer matching each anchor's shape and dtype.
    data = []
    for anchor in anchors:
        info = anchor.tensorInfo()
        shape = info.shape()
        data.append(np.random.randn(*shape).astype(info.numpyDType()))
    return data


if __name__ == "__main__":
    main()

Download session_quick_start.py
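
The script is invoked in the same way as the other examples (model.popef is a placeholder):

  python3 session_quick_start.py --popef model.popef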