7. Examples

7.1. Listing running applications

This example provides a list of the user processes currently running on IPUs within the system.

Example output:

$ gc-active-apps
Graphcore active apps:  1
 o application #0: [gc-powertest]

7.1.1. Python

 1import sys
 2import gcipuinfo
 3
 4discovery_mode = gcipuinfo.DiscoverActivePartitionIPUs
 5if len(sys.argv) > 1:
 6  if sys.argv[1] == "--all-partitions":
 7    discovery_mode = gcipuinfo.DiscoverAllPartitionIPUs
 8  else:
 9    print("Error, unrecognised option.")
10    print("Specify --all-partitions to show IPUs in all partitions")
11    sys.exit(1)
12
13inventory = gcipuinfo.gcipuinfo(discovery_mode)
14apps = inventory.getNamedAttributeForAll(gcipuinfo.UserExecutable)
15
16print("Graphcore active apps: " + str(len(apps)))
17for index, exe_name in enumerate(apps):
18  if exe_name:
19    print(" o application #{}: [{}]".format(index, exe_name))

gc_active_apps.py

7.1.2. C++

 1#include <iostream>
 2#include <string.h>
 3
 4#include "graphcore_target_access/gcipuinfo/IPUAttributeLabels.h"
 5#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
 6
 7int main(int argc, char *argv[]) {
 8
 9  DeviceDiscoveryMode discoveryMode = DiscoverActivePartitionIPUs;
10  if (argc > 1) {
11    if (strcmp(argv[1], "--all-partitions") == 0) {
12      discoveryMode = DiscoverAllPartitionIPUs;
13    } else {
14      std::cerr << "Error, unrecognised option.\n";
15      std::cerr << "Specify --all-partitions to show IPUs in all partitions\n";
16      std::exit(1);
17    }
18  }
19  gcipuinfo inventory(discoveryMode);
20
21  auto apps =
22      inventory.getNamedAttributeForAll(IPUAttributeLabels::UserExecutable);
23
24  std::cout << "Graphcore active apps: " << apps.size() << "\n";
25  unsigned index = 0;
26  for (auto &exeName : apps) {
27
28    if (exeName.size()) {
29      std::cout << " o application #" << index++ << ": [" << exeName << "]\n";
30    }
31    index++;
32  }
33
34  return 0;
35}

gc_active_apps.cpp

7.1.3. Go

 1package main
 2
 3import (
 4	"os"
 5	"fmt"
 6	"gcipuinfo"
 7)
 8
 9func main() {
10
11	discoveryMode := gcipuinfo.DiscoverActivePartitionIPUs
12	if len(os.Args) > 1 {
13		if os.Args[1] == "--all-partitions" {
14			discoveryMode = gcipuinfo.DiscoverAllPartitionIPUs
15		} else {
16			fmt.Println("Error, unrecognised option.")
17			fmt.Println("Specify --all-partitions to show IPUs in all partitions")
18			os.Exit(1)
19		}
20	}
21
22	inventory := gcipuinfo.NewGcipuinfo(discoveryMode)
23	var exeNames []string = inventory.GetNamedAttributeForAll(gcipuinfo.UserExecutable)
24	fmt.Println("Graphcore active apps: ", len(exeNames))
25	for index, exeName := range exeNames {
26			if len(exeName) > 0 {
27				fmt.Printf(" o application #%d: [%s]\n", index, exeName)
28			}
29	}
30}

gc_active_apps.go

7.2. Listing a collection of attributes across all IPUs in the system

This example loops over the IPUs in the system and displays a subset of the attributes for each device. It runs forever, polling for updated attribute values. By default, only IPUs in the currently active partition or GCD are displayed. If you specify the --all-partitions flag, the gcipuinfo object is configured with the DiscoverAllPartitionIPUs option, which will retrieve information for IPUs in all known partitions.

Example output:

$ gc-get-attributes-for-all-devices
*** Iteration 0 ***

Device 0
  user process id : 62931
  user executable : ./example
  user name : ipuuser
  board ipu index : 3
  board serial number : 0026.0002.8203321
  clock : 1300MHz
  total board power : 46.0 W
  average board temp : 41.3 C
  ipu utilisation : 100.00%
  max active code size (bytes) : 36420
  max active data size (bytes) : 318
  max active stack size (bytes) : 1344

Device 1
  (device not in use by any known process)
  board ipu index : 2
  board serial number : 0026.0002.8203321
  clock : 1300MHz
  total board power : N/A
  average board temp : N/A
  ipu utilisation : 0.00%

Device 2
  (device not in use by any known process)
  board ipu index : 1
  board serial number : 0026.0001.8203321
  clock : 1300MHz
  total board power : N/A
  average board temp : N/A
  ipu utilisation : 0.00%

7.2.1. C++

 1#include <iostream>
 2#include <string.h>
 3#include <unistd.h>
 4
 5#include "graphcore_target_access/gcipuinfo/IPUAttributeLabels.h"
 6#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
 7
 8void printAttribute(const std::map<std::string, std::string> &map,
 9                    const std::string &key) {
10  if (map.count(key) == 0)
11    return;
12  std::cout << "  " << key << " : " << map.at(key) << "\n";
13}
14
15int main(int argc, char *argv[]) {
16
17  DeviceDiscoveryMode discoveryMode = DiscoverActivePartitionIPUs;
18  if (argc > 1) {
19    if (strcmp(argv[1], "--all-partitions") == 0) {
20      discoveryMode = DiscoverAllPartitionIPUs;
21    } else {
22      std::cerr << "Error, unrecognised option.\n";
23      std::cerr << "Specify --all-partitions to show IPUs in all partitions\n";
24      std::exit(1);
25    }
26  }
27  gcipuinfo inventory(discoveryMode);
28
29  std::cout << "Devices:\n";
30
31  unsigned count = 0;
32  while (1) {
33    std::cout << "*** Iteration " << count++ << " ***\n\n";
34
35    inventory.updateData(); // Refresh the device attribute values
36
37    auto deviceMaps = inventory.getDevices();
38    for (auto deviceMap : deviceMaps) {
39      std::cout << "Device " << deviceMap.at(IPUAttributeLabels::DeviceId)
40                << "\n";
41
42      // Only devices that are in use will have a process id associated with
43      // them
44      if (deviceMap.count(IPUAttributeLabels::UserProcessId)) {
45        printAttribute(deviceMap, IPUAttributeLabels::UserProcessId);
46        printAttribute(deviceMap, IPUAttributeLabels::UserExecutable);
47        printAttribute(deviceMap, IPUAttributeLabels::UserName);
48      } else {
49        std::cout << "  (device not in use by any known process)\n";
50      }
51      printAttribute(deviceMap, IPUAttributeLabels::BoardIpuIndex);
52      printAttribute(deviceMap, IPUAttributeLabels::SerialNumber);
53      printAttribute(deviceMap, IPUAttributeLabels::ClockFrequency);
54      printAttribute(deviceMap, IPUAttributeLabels::TotalBoardPower);
55      printAttribute(deviceMap, IPUAttributeLabels::AverageBoardTemp);
56      printAttribute(deviceMap, IPUAttributeLabels::IpuUtilisation);
57      printAttribute(deviceMap, IPUAttributeLabels::MaxActiveCodeSize);
58      printAttribute(deviceMap, IPUAttributeLabels::MaxActiveDataSize);
59      printAttribute(deviceMap, IPUAttributeLabels::MaxActiveStackSize);
60      std::cout << "\n";
61    }
62    sleep(1);
63  }
64
65  return 0;
66}

gc_get_attributes_for_all_devices.cpp

7.2.2. Go

 1package main
 2
 3import (
 4	"os"
 5	"fmt"
 6	"time"
 7	"gcipuinfo"
 8)
 9
10func printAttribute(deviceMap map[string]interface{}, key string) {
11	if deviceMap[key] != nil {
12		fmt.Println("  ", key, " : ", deviceMap[key])
13	}
14}
15
16func main() {
17	discoveryMode := gcipuinfo.DiscoverActivePartitionIPUs
18	if len(os.Args) > 1 {
19		if os.Args[1] == "--all-partitions" {
20			discoveryMode = gcipuinfo.DiscoverAllPartitionIPUs
21		} else {
22			fmt.Println("Error, unrecognised option.")
23			fmt.Println("Specify --all-partitions to show IPUs in all partitions")
24			os.Exit(1)
25		}
26	}
27
28	inventory := gcipuinfo.NewGcipuinfo(discoveryMode)
29
30	count := 0
31	for {
32		fmt.Println("*** Iteration ", count, " ***\n")
33		count++
34
35		inventory.UpdateData()
36		deviceMaps := inventory.GetDevices()
37		for _, attribs := range deviceMaps {
38			deviceMap := attribs.(map[string]interface{})
39
40			fmt.Println("Device ", deviceMap[gcipuinfo.DeviceId])
41
42			if deviceMap[gcipuinfo.UserProcessId] != nil {
43				printAttribute(deviceMap, gcipuinfo.UserProcessId)
44				printAttribute(deviceMap, gcipuinfo.UserExecutable)
45				printAttribute(deviceMap, gcipuinfo.UserName)
46			} else {
47				fmt.Println("  (device not in use by any known process)")
48			}
49			printAttribute(deviceMap, gcipuinfo.BoardIpuIndex)
50			printAttribute(deviceMap, gcipuinfo.SerialNumber)
51			printAttribute(deviceMap, gcipuinfo.ClockFrequency)
52			printAttribute(deviceMap, gcipuinfo.TotalBoardPower)
53			printAttribute(deviceMap, gcipuinfo.AverageBoardTemp)
54			printAttribute(deviceMap, gcipuinfo.IpuUtilisation)
55			printAttribute(deviceMap, gcipuinfo.MaxActiveCodeSize)
56			printAttribute(deviceMap, gcipuinfo.MaxActiveDataSize)
57			printAttribute(deviceMap, gcipuinfo.MaxActiveStackSize)
58		}
59		fmt.Println("")
60		time.Sleep(1 * time.Second)
61	}
62}

gc_get_attributes_for_all_devices.go

7.3. Graph power consumption of IPUs

7.3.1. Python

  1import argparse
  2import time
  3import sys
  4import math
  5import os
  6
  7# Requires asciichartpy: pip3 install --user asciichartpy
  8import asciichartpy
  9import gcipuinfo
 10
 11ipu_info = gcipuinfo.gcipuinfo()
 12num_devices = len(ipu_info.getDevices())
 13if num_devices == 0:
 14    print("gc_power_consumption.py: error: no IPUs detected", file=sys.stderr)
 15    exit(-1)
 16
 17
 18def get_ipu_power_single(device_id):
 19    if 0 <= device_id and device_id < num_devices:
 20        return pow_to_float(
 21            ipu_info.getNamedAttributeForDevice(device_id, gcipuinfo.IpuPower)
 22        )
 23    else:
 24        print(
 25            f"gc_power_consumption.py: error: device id {device_id} does not exist (valid range is 0-{num_devices-1})",
 26            file=sys.stderr,
 27        )
 28        exit(-1)
 29
 30
 31def get_ipu_power_from_device_list(devices):
 32    powers = []
 33    for device_id in devices:
 34        pow = ipu_info.getNamedAttributeForDevice(device_id, gcipuinfo.IpuPower)
 35        if pow != "N/A":
 36            powers.append(pow_to_float(pow))
 37    return powers
 38
 39def get_ipu_power_all():
 40    device_powers = ipu_info.getNamedAttributeForAll(gcipuinfo.IpuPower)
 41    return [pow_to_float(pow) for pow in device_powers if pow != "N/A"]
 42
 43
 44def pow_to_float(pow):
 45    # Power is reported in the format xxx.xxW, so remove the last character.
 46    # We also handle the case when the power reports as N/A.
 47    try:
 48        return float(pow[:-1])
 49    except ValueError:
 50        return 0
 51
 52
 53def draw_graph(power_history, mode, num_devices, device_ids, min, max, width, height):
 54    graph_cfg = {
 55        "height": height - 3,  # Leave room for the title at the top
 56        "format": "{:8.2f}W ",
 57        "min": min if min else 0,
 58    }
 59    if max and max > graph_cfg["min"]:
 60        graph_cfg["max"] = max
 61
 62    if device_ids:
 63        title_str = mode.capitalize() + " power consumption for IPUs: " + ", ".join(map(str, device_ids))
 64    else:
 65        title_str = mode.capitalize() + " power consumption for " + str(num_devices) + " IPUs"
 66
 67    print(title_str.center(width))
 68    graph = asciichartpy.plot(power_history, graph_cfg) + "\n"
 69    sys.stdout.buffer.write(graph.encode("utf-8"))
 70    sys.stdout.flush()
 71
 72
 73def main():
 74    parser = argparse.ArgumentParser(description="Display a console graph of IPU power consumption over time")
 75    parser.add_argument("--min", type=float, help="Minimum y-axis value, in watts")
 76    parser.add_argument("--max", type=float, help="Maximum y-axis value, in watts")
 77    parser.add_argument(
 78        "--interval",
 79        type=float,
 80        default=1,
 81        help="Interval between power queries, in seconds",
 82    )
 83    parser.add_argument(
 84        "--devices", type=int, nargs="+", help="only query specific devices"
 85    )
 86    parser.add_argument('--mode',  help='Simulator IPU architecture',
 87                        choices=["mean", "total"],
 88                        required=False, default="mean")
 89
 90    # This example assumes per-IPU power sensors, which are not available on
 91    # C2 devices
 92    if ipu_info.getNamedAttributeForDevice(0, gcipuinfo.BoardType) != "M2000":
 93        print("This program is only supported on IPU-Machine devices")
 94        sys.exit(1)
 95
 96    args = parser.parse_args()
 97
 98    try:
 99        term_width, term_height = os.get_terminal_size()
100    except OSError:
101        print(
102            "gc_power_consumption.py: warning: stdout is not attached to a tty, using 50x50 graph",
103            file=sys.stderr,
104        )
105        term_width, term_height = 50
106
107    power_history = []
108    max_entries = term_width - 15  # Leave enough room for the y-axis labels
109
110    while True:
111        if not args.devices:
112            powers = get_ipu_power_all()
113        else:
114            powers = get_ipu_power_from_device_list(args.devices)
115        if len(powers) > 0:
116            if args.mode == "mean":
117                val = sum(powers) / len(powers)
118            else:
119                val = sum(powers)
120            power_history.append(val)
121        if len(power_history) > max_entries:
122            power_history = power_history[1:]
123
124        if any([power != 0 for power in power_history]):
125            draw_graph(
126                power_history,
127                args.mode,
128                len(powers),
129                args.devices,
130                min=args.min,
131                max=args.max,
132                width=term_width,
133                height=term_height,
134            )
135        else:
136            print("  -- Waiting for devices to power on...", end="\r")
137
138        time.sleep(args.interval)
139
140
141if __name__ == "__main__":
142    main()

gc_power_consumption.py

7.4. Display a message when device 0 is in use

Example output when device 0 is idle:

$ go run gc_track_device_0.go
Device 0 is idle

Example output when device 0 is in use:

$ go run gc_track_device_0.go
User exampleuser is running application gc-powertest on device 0

7.4.1. C++

 1#include <iostream>
 2#include <string>
 3
 4#include "graphcore_target_access/gcipuinfo/IPUAttributeLabels.h"
 5#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
 6
 7int main() {
 8  gcipuinfo inventory;
 9  int deviceOfInterest = 0; // device id
10
11  std::string exeName = inventory.getNamedAttributeForDevice(
12      deviceOfInterest, IPUAttributeLabels::UserExecutable);
13  std::string userName = inventory.getNamedAttributeForDevice(
14      deviceOfInterest, IPUAttributeLabels::UserName);
15
16  if (!exeName.empty()) {
17    std::cout << "User " << userName << " is running application " << exeName
18              << " on device " << deviceOfInterest << "\n";
19  } else {
20    std::cout << "Device " << deviceOfInterest << " is idle\n";
21  }
22
23  return 0;
24}

gc_track_device_0.cpp

7.4.2. Python

 1import gcipuinfo
 2
 3inventory = gcipuinfo.gcipuinfo()
 4device_of_interest = 0
 5exe_name = inventory.getNamedAttributeForDevice(device_of_interest, gcipuinfo.UserExecutable)
 6user_name = inventory.getNamedAttributeForDevice(device_of_interest, gcipuinfo.UserName)
 7
 8if exe_name:
 9  print("User " + user_name + " is running " + exe_name + " on device " + str(device_of_interest))
10else:
11  print("Device " + str(device_of_interest) + " is idle")

gc_track_device_0.py

7.4.3. Go

 1package main
 2
 3import (
 4	"fmt"
 5	"gcipuinfo"
 6)
 7
 8func main() {
 9
10	inventory := gcipuinfo.NewGcipuinfo()
11	deviceOfInterest := 0 // device id
12	exeName := inventory.GetNamedAttributeForDevice(deviceOfInterest, gcipuinfo.UserExecutable)
13	userName := inventory.GetNamedAttributeForDevice(deviceOfInterest, gcipuinfo.UserName)
14	if len(exeName) > 0 {
15		fmt.Printf("User %s is running application %s on device %d\n", userName, exeName, deviceOfInterest)
16	} else {
17		fmt.Printf("Device %d is idle\n", deviceOfInterest)
18	}
19}

gc_track_device_0.go

7.5. Display application event record entries

This example program demonstrates how to retrieve and interpret application event records. The program takes one or more application event record paths as command line parameters and displays the contents of any events it finds in the specified paths, along with a list of all IPU-Machine hosts named in at least one event.

Note

For demonstration purposes, the examples/python directory includes a write_example_event_record_entry.py script that writes example application event record entries.

Example output when run with two application event records that are both empty:

$ python gc_event_record.py /tmp/ipu_app_event_record_1 /tmp/ipu_app_event_record_2
== Checking application event record:  /tmp/ipu_app_event_record_1
No event record entry found.
== Checking application event record:  /tmp/ipu_app_event_record_2
No event record entry found.

Example output when there is an entry in /tmp/ipu_app_event_record_1 but not in /tmp/ipu_app_event_record_2:

$ python gc_event_record.py /tmp/ipu_app_event_record_1 /tmp/ipu_app_event_record_2
== Checking application event record: /tmp/ipu_app_event_record_1
{
  "attached ipu hosts": [
    "10.1.5.10"
  ],
  "attached ipus": [
    0,
    1
  ],
  "command line": "./example program with args",
  "description": "A nonrecoverable error has occurred.",
  "event record path": "/tmp/ipu_app_event_record_1/last_event.json",
  "partition": "p1",
  "pid": "35024",
  "severity": "nonrecoverable",
  "specific ipu hosts": [
    "10.1.5.10"
  ],
  "specific ipus": [
    0
  ],
  "timestamp": "2021-11-29T17:23:54.989896Z"
}
Last event was an error.

== Checking application event record: /tmp/ipu_app_event_record_2
No event record entry found.

== IPU-Machine hosts involved in errors, across all application event records:
  10.1.5.10

7.5.1. C++

 1#include <iostream>
 2#include <set>
 3
 4#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
 5#include <nlohmann/json.hpp>
 6using json = nlohmann::json;
 7
 8int main(int argc, char *argv[]) {
 9
10  if (argc < 2) {
11    std::cerr << __FILE__ << ": error requires at least one argument\n";
12    return 1;
13  }
14
15  gcipuinfo inventory;
16
17  std::set<std::string> hostsNamedInErrors;
18
19  for (int i = 1; i < argc; i++) {
20    std::cout << "== Checking application event record: " << argv[i] << "\n";
21
22    std::string eventRecordJSON =
23        inventory.getLastAppEventRecordAsJSON(gcipuinfo::EventSevNone, argv[i]);
24
25    json j = json::parse(eventRecordJSON);
26    if (j.empty()) {
27      std::cout << "No event record entry found.\n";
28    } else {
29      std::cout << j.dump(2) << "\n";
30
31      // If the event has named specific IPUs
32      // record their associated machine names
33      bool foundSpecificHosts = false;
34      if (j.contains(gcipuinfo::keySpecificIPUHosts)) {
35        auto hosts = j[gcipuinfo::keySpecificIPUHosts];
36        foundSpecificHosts = hosts.size() > 0;
37        for (const auto &element : hosts.items()) {
38          hostsNamedInErrors.insert(element.value().dump());
39        }
40      }
41      if (!foundSpecificHosts && j.contains(gcipuinfo::keyAttachedIPUHosts)) {
42        // if no specific IPUs have been mentioned in this error,
43        // fall back to recording hosts of all attached IPUs
44        auto hosts = j[gcipuinfo::keyAttachedIPUHosts];
45        for (const auto &element : hosts.items()) {
46          hostsNamedInErrors.insert(element.value().dump());
47        }
48      }
49      auto severity = inventory.getLastAppEventRecordSeverity(argv[i]);
50      if (severity > gcipuinfo::EventSevWarning) {
51        std::cout << "Last event was an error.\n";
52      }
53    }
54    std::cout << "\n";
55  }
56
57  if (hostsNamedInErrors.size() > 0) {
58    std::cout << "== IPU-Machine hosts involved in errors, across all "
59                 "application event records:\n";
60    for (auto host : hostsNamedInErrors) {
61      std::cout << "  " << host << "\n";
62    }
63  }
64
65  return 0;
66}

gc_event_record.cpp

7.5.2. Python

 1import sys
 2import json
 3import gcipuinfo
 4
 5if len(sys.argv) < 2:
 6  print("Error, requires at least one argument")
 7  sys.exit(1)
 8
 9inventory = gcipuinfo.gcipuinfo()
10
11hosts_named_in_errors = set()
12
13for i in range(1,len(sys.argv)):
14  path = sys.argv[i]
15  print("== Checking application event record: " + path)
16  event_record_json = inventory.getLastAppEventRecordAsJSON(inventory.EventSevNone, path)
17  j = json.loads(event_record_json)
18  if not j:
19    print("No event record entry found.")
20  else:
21    print(event_record_json)
22
23    # If the event has named specific IPUs
24    # record their associated machine names
25    found_specific_hosts = False
26    if inventory.keySpecificIPUHosts in j:
27      hosts = j[inventory.keySpecificIPUHosts]
28      found_specific_hosts = len(hosts) > 0
29      for host in hosts:
30        hosts_named_in_errors.add(host)
31
32    # if no specific IPUs have been mentioned in this error,
33    # fall back to recording hosts of all attached IPUs
34    if not found_specific_hosts and inventory.keyAttachedIPUHosts in j:
35      hosts = j[inventory.keyAttachedIPUHosts]
36      for host in hosts:
37        hosts_named_in_errors.add(host)
38    if (inventory.getLastAppEventRecordSeverity(path) > inventory.EventSevWarning):
39      print("Last event was an error.")
40  print("")
41
42if len(hosts_named_in_errors) > 0:
43  print("== IPU-Machine hosts involved in errors, across all application event records:")
44  for host in hosts_named_in_errors:
45    print("  " + host)

gc_event_record.py

7.5.3. Go

 1package main
 2
 3import (
 4	"fmt"
 5	"os"
 6	"gcipuinfo"
 7	"encoding/json"
 8)
 9
10func main() {
11	inventory := gcipuinfo.NewGcipuinfo()
12
13	if len(os.Args) < 2 {
14		panic("Error, requires at least one argument")
15	}
16
17	hostsNamedInErrors := map[string]bool{}
18
19	for i := 1; i < len(os.Args); i++ {
20		path := os.Args[i]
21		fmt.Println("== Checking application event record: ", path)
22		eventRecordJSON := inventory.GetLastAppEventRecordAsJSON(gcipuinfo.EventSevNone, path)
23		var eventRecord map[string]interface{}
24		json.Unmarshal([]byte(eventRecordJSON), &eventRecord)
25		if len(eventRecord) == 0 {
26			fmt.Printf("No event record entry found.\n")
27		} else {
28			fmt.Println(eventRecordJSON)
29
30			// If the event has named specific IPUs
31			// record their associated machine names
32			foundSpecificHosts := false
33			if _, ok := eventRecord[gcipuinfo.KeySpecificIPUHosts]; ok {
34				hosts := eventRecord[gcipuinfo.KeySpecificIPUHosts].([]interface{})
35				foundSpecificHosts = len(hosts) > 0
36				for _, host := range hosts {
37					hostsNamedInErrors[fmt.Sprint(host)] = true
38				}
39			}
40			if !foundSpecificHosts {
41				// if no specific IPUs have been mentioned in this error,
42				// fall back to recording hosts of all attached IPUs
43				if _, ok := eventRecord[gcipuinfo.KeyAttachedIPUHosts]; ok {
44					hosts := eventRecord[gcipuinfo.KeyAttachedIPUHosts].([]interface{})
45					for _, host := range hosts {
46						hostsNamedInErrors[fmt.Sprint(host)] = true
47					}
48				}
49			}
50			severity := inventory.GetLastAppEventRecordSeverity(path);
51			if (severity > gcipuinfo.EventSevWarning) {
52				fmt.Println("Last event was an error.\n");
53			}
54		}
55	}
56	if len(hostsNamedInErrors) > 0 {
57		fmt.Printf("== IPU-Machine hosts involved in errors, across all application event records:\n");
58		for host, _ := range hostsNamedInErrors {
59			fmt.Printf("  %s\n", host)
60		}
61	}
62}

gc_event_record.go

7.6. Display device health-check result

This example runs forever displaying device health-check results, both as a raw JSON string and then, to demonstrate how to parse the output, in a summarised message. By default, the health checks will only be run on IPUs in the active partition. If you specify the --all-partitions flag, the gcipuinfo object is configured with the DiscoverAllPartitionIPUs option, and health checks will be run on IPUs in all known partitions.

Example output when there are no failing devices:

$ python gc_health_check.py
*** Iteration 0 ***
Raw JSON:
{}
No errors

*** Iteration 1 ***
Raw JSON:
{}
No errors

*** Iteration 2 ***
Raw JSON:
{}
No errors
...

Example output when device 1 on 10.1.5.10 has failed:

$ python gc_health_check.py
*** Iteration 0 ***
Raw JSON:
{
  "hosts": {
    "10.1.5.10": [
      {
        "error": "connection",
        "id": "1",
        "partition": "p1",
        "board ipu index": "2"
      }
    ]
  }
}

Parsed:
  host: 10.1.5.10
    device id: 1, partition: p1, error: connection


...

7.6.1. C++

 1#include <chrono>
 2#include <iostream>
 3#include <string.h>
 4#include <thread>
 5
 6#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
 7#include <nlohmann/json.hpp>
 8
 9using json = nlohmann::json;
10
11int main(int argc, char *argv[]) {
12
13  DeviceDiscoveryMode discoveryMode = DiscoverActivePartitionIPUs;
14  if (argc > 1) {
15    if (strcmp(argv[1], "--all-partitions") == 0) {
16      discoveryMode = DiscoverAllPartitionIPUs;
17    } else {
18      std::cerr << "Error, unrecognised option.\n";
19      std::cerr << "Specify --all-partitions to show IPUs in all partitions\n";
20      std::exit(1);
21    }
22  }
23  gcipuinfo inventory(discoveryMode);
24
25  unsigned count = 0;
26  unsigned numIPUs = inventory.getDevices().size();
27  while (true) {
28    std::cout << "*** Iteration " << count++ << " ***\n\n";
29    std::cout << "Checking " << numIPUs << " devices:\n";
30    // Set 100ms timeout on response from each IPU.  Don't check active IPUs.
31    std::string devicesHealth = inventory.checkHealthOfDevices(100, false);
32    std::cout << "Raw JSON: " << devicesHealth << "\n";
33    json j = json::parse(devicesHealth);
34    if (j.contains("hosts")) {
35      std::cout << "Parsed:\n";
36      auto hosts = j["hosts"];
37      for (auto host : hosts.items()) {
38        std::cout << "  host: " << host.key() << "\n";
39        for (auto device : host.value().items()) {
40          std::cout << "    device id: " << device.value()["id"]
41                    << ", partition: " << device.value()["partition"]
42                    << ", error: " << device.value()["error"] << "\n";
43        }
44      }
45    } else if (j.contains("error")) {
46      std::cout << "Parsed:\n";
47      std::cout << "    error: " << j["error"]
48                << ", description: " << j["description"] << "\n";
49    } else {
50      std::cout << "No errors\n";
51    }
52    std::cout << "\n";
53    std::this_thread::sleep_for(std::chrono::milliseconds(10));
54  }
55
56  return 0;
57}

gc_health_check.cpp

7.6.2. Python

 1import json
 2import time
 3import sys
 4import gcipuinfo
 5
 6discovery_mode = gcipuinfo.DiscoverActivePartitionIPUs
 7if len(sys.argv) > 1:
 8  if sys.argv[1] == "--all-partitions":
 9    discovery_mode = gcipuinfo.DiscoverAllPartitionIPUs
10  else:
11    print("Error, unrecognised option.")
12    print("Specify --all-partitions to show IPUs in all partitions")
13    sys.exit(1)
14
15inventory = gcipuinfo.gcipuinfo(discovery_mode)
16num_ipus = len(inventory.getDevices())
17
18count = 0
19while True:
20  print("** Iteration " + str(count) + " ***")
21  print("Checking " + str(num_ipus) + " devices")
22  devicesHealth = inventory.checkHealthOfDevices(100, False)
23  print("Raw JSON: \n" + devicesHealth)
24  j = json.loads(devicesHealth)
25  if "hosts" in j:
26    print("Parsed:")
27    hosts = j["hosts"]
28    for host_name in hosts:
29      print("  host: " + host_name)
30      for device in hosts[host_name]:
31        print("    device id: " + device["id"] + ", partition: " + device["partition"] + ", error: " + device["error"])
32  elif "error" in j:
33    print("Parsed:")
34    print("  error: " + j["error"] + ", description: " + j["description"])
35  else:
36    print("No errors")
37  print("")
38  count = count + 1
39  time.sleep(0.01)

gc_health_check.py

7.6.3. Go

 1package main
 2
 3import (
 4	"os"
 5	"fmt"
 6	"gcipuinfo"
 7	"encoding/json"
 8	"time"
 9)
10
11func main() {
12	discoveryMode := gcipuinfo.DiscoverActivePartitionIPUs
13	if len(os.Args) > 1 {
14		if os.Args[1] == "--all-partitions" {
15			discoveryMode = gcipuinfo.DiscoverAllPartitionIPUs
16		} else {
17			fmt.Println("Error, unrecognised option.")
18			fmt.Println("Specify --all-partitions to show IPUs in all partitions")
19			os.Exit(1)
20		}
21	}
22
23	inventory := gcipuinfo.NewGcipuinfo(discoveryMode)
24	numIPUs := len(inventory.GetDevices())
25
26	count := 0
27	for {
28		fmt.Println("*** Iteration ", count, " ***\n")
29		fmt.Println("Checking ", numIPUs, " devices\n")
30		count++
31
32		var devicesHealth string
33		devicesHealth = inventory.CheckHealthOfDevices(100, false)
34		fmt.Println("Raw JSON:", devicesHealth)
35
36		var result map[string]interface{}
37		json.Unmarshal([]byte(devicesHealth), &result)
38		if _, ok := result["hosts"]; ok {
39			fmt.Println("Parsed:")
40			hosts := result["hosts"].(map[string]interface{})
41			for hostName, hostVal := range hosts {
42				fmt.Println("  host: ", hostName)
43				devices := hostVal.([]interface{})
44				for _, deviceVal := range devices {
45					device := deviceVal.(map[string]interface{})
46					fmt.Println("    device id: ", device["id"], "partition: ", device["partition"], ", error: ", device["error"])
47				}
48			}
49		} else if _, ok := result["error"]; ok {
50			fmt.Println("Parsed:")
51			fmt.Println("  error: ", result["error"], ", description: ", result["description"])
52		} else {
53			fmt.Println("No errors")
54		}
55		time.Sleep(time.Duration(10) * time.Millisecond)
56	}
57}

gc_health_check.go