7. Examples
7.1. Listing running applications
This example provides a list of the user processes currently running on IPUs within the system.
Example output:
$ gc-active-apps
Graphcore active apps: 1
o application #0: [gc-powertest]
7.1.1. Python
import sys
import gcipuinfo

# Choose which IPUs are discovered: the active partition unless the first
# command-line argument is --all-partitions. Any other argument is an error.
extra_args = sys.argv[1:]
if not extra_args:
    discovery_mode = gcipuinfo.DiscoverActivePartitionIPUs
elif extra_args[0] == "--all-partitions":
    discovery_mode = gcipuinfo.DiscoverAllPartitionIPUs
else:
    print("Error, unrecognised option.")
    print("Specify --all-partitions to show IPUs in all partitions")
    sys.exit(1)

inventory = gcipuinfo.gcipuinfo(discovery_mode)
apps = inventory.getNamedAttributeForAll(gcipuinfo.UserExecutable)

# The count is the number of entries returned; only entries with a non-empty
# executable name are listed, each with its position in the returned sequence.
print("Graphcore active apps: {}".format(len(apps)))
for index, exe_name in enumerate(apps):
    if not exe_name:
        continue
    print(" o application #{}: [{}]".format(index, exe_name))
7.1.2. C++
1#include <iostream>
2#include <string.h>
3
4#include "graphcore_target_access/gcipuinfo/IPUAttributeLabels.h"
5#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
6
7int main(int argc, char *argv[]) {
8
9 DeviceDiscoveryMode discoveryMode = DiscoverActivePartitionIPUs;
10 if (argc > 1) {
11 if (strcmp(argv[1], "--all-partitions") == 0) {
12 discoveryMode = DiscoverAllPartitionIPUs;
13 } else {
14 std::cerr << "Error, unrecognised option.\n";
15 std::cerr << "Specify --all-partitions to show IPUs in all partitions\n";
16 std::exit(1);
17 }
18 }
19 gcipuinfo inventory(discoveryMode);
20
21 auto apps =
22 inventory.getNamedAttributeForAll(IPUAttributeLabels::UserExecutable);
23
24 std::cout << "Graphcore active apps: " << apps.size() << "\n";
25 unsigned index = 0;
26 for (auto &exeName : apps) {
27
28 if (exeName.size()) {
29 std::cout << " o application #" << index++ << ": [" << exeName << "]\n";
30 }
31 index++;
32 }
33
34 return 0;
35}
7.1.3. Go
1package main
2
3import (
4 "os"
5 "fmt"
6 "gcipuinfo"
7)
8
9func main() {
10
11 discoveryMode := gcipuinfo.DiscoverActivePartitionIPUs
12 if len(os.Args) > 1 {
13 if os.Args[1] == "--all-partitions" {
14 discoveryMode = gcipuinfo.DiscoverAllPartitionIPUs
15 } else {
16 fmt.Println("Error, unrecognised option.")
17 fmt.Println("Specify --all-partitions to show IPUs in all partitions")
18 os.Exit(1)
19 }
20 }
21
22 inventory := gcipuinfo.NewGcipuinfo(discoveryMode)
23 var exeNames []string = inventory.GetNamedAttributeForAll(gcipuinfo.UserExecutable)
24 fmt.Println("Graphcore active apps: ", len(exeNames))
25 for index, exeName := range exeNames {
26 if len(exeName) > 0 {
27 fmt.Printf(" o application #%d: [%s]\n", index, exeName)
28 }
29 }
30}
7.2. Listing a collection of attributes across all IPUs in the system
This example loops over the IPUs in the system and displays a subset of
the attributes for each device.
It runs forever, polling for updated attribute values.
By default, only IPUs in the currently active partition or GCD are displayed.
If you specify the --all-partitions flag, the gcipuinfo object is configured with the DiscoverAllPartitionIPUs option, which will retrieve information for IPUs in all known partitions.
Example output:
$ gc-get-attributes-for-all-devices
*** Iteration 0 ***
Device 0
user process id : 62931
user executable : ./example
user name : ipuuser
board ipu index : 3
board serial number : 0026.0002.8203321
clock : 1300MHz
total board power : 46.0 W
average board temp : 41.3 C
ipu utilisation : 100.00%
max active code size (bytes) : 36420
max active data size (bytes) : 318
max active stack size (bytes) : 1344
Device 1
(device not in use by any known process)
board ipu index : 2
board serial number : 0026.0002.8203321
clock : 1300MHz
total board power : N/A
average board temp : N/A
ipu utilisation : 0.00%
Device 2
(device not in use by any known process)
board ipu index : 1
board serial number : 0026.0001.8203321
clock : 1300MHz
total board power : N/A
average board temp : N/A
ipu utilisation : 0.00%
7.2.1. C++
1#include <iostream>
2#include <string.h>
3#include <unistd.h>
4
5#include "graphcore_target_access/gcipuinfo/IPUAttributeLabels.h"
6#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
7
8void printAttribute(const std::map<std::string, std::string> &map,
9 const std::string &key) {
10 if (map.count(key) == 0)
11 return;
12 std::cout << " " << key << " : " << map.at(key) << "\n";
13}
14
15int main(int argc, char *argv[]) {
16
17 DeviceDiscoveryMode discoveryMode = DiscoverActivePartitionIPUs;
18 if (argc > 1) {
19 if (strcmp(argv[1], "--all-partitions") == 0) {
20 discoveryMode = DiscoverAllPartitionIPUs;
21 } else {
22 std::cerr << "Error, unrecognised option.\n";
23 std::cerr << "Specify --all-partitions to show IPUs in all partitions\n";
24 std::exit(1);
25 }
26 }
27 gcipuinfo inventory(discoveryMode);
28
29 std::cout << "Devices:\n";
30
31 unsigned count = 0;
32 while (1) {
33 std::cout << "*** Iteration " << count++ << " ***\n\n";
34
35 inventory.updateData(); // Refresh the device attribute values
36
37 auto deviceMaps = inventory.getDevices();
38 for (auto deviceMap : deviceMaps) {
39 std::cout << "Device " << deviceMap.at(IPUAttributeLabels::DeviceId)
40 << "\n";
41
42 // Only devices that are in use will have a process id associated with
43 // them
44 if (deviceMap.count(IPUAttributeLabels::UserProcessId)) {
45 printAttribute(deviceMap, IPUAttributeLabels::UserProcessId);
46 printAttribute(deviceMap, IPUAttributeLabels::UserExecutable);
47 printAttribute(deviceMap, IPUAttributeLabels::UserName);
48 } else {
49 std::cout << " (device not in use by any known process)\n";
50 }
51 printAttribute(deviceMap, IPUAttributeLabels::BoardIpuIndex);
52 printAttribute(deviceMap, IPUAttributeLabels::SerialNumber);
53 printAttribute(deviceMap, IPUAttributeLabels::ClockFrequency);
54 printAttribute(deviceMap, IPUAttributeLabels::TotalBoardPower);
55 printAttribute(deviceMap, IPUAttributeLabels::AverageBoardTemp);
56 printAttribute(deviceMap, IPUAttributeLabels::IpuUtilisation);
57 printAttribute(deviceMap, IPUAttributeLabels::MaxActiveCodeSize);
58 printAttribute(deviceMap, IPUAttributeLabels::MaxActiveDataSize);
59 printAttribute(deviceMap, IPUAttributeLabels::MaxActiveStackSize);
60 std::cout << "\n";
61 }
62 sleep(1);
63 }
64
65 return 0;
66}
7.2.2. Go
1package main
2
3import (
4 "os"
5 "fmt"
6 "time"
7 "gcipuinfo"
8)
9
10func printAttribute(deviceMap map[string]interface{}, key string) {
11 if deviceMap[key] != nil {
12 fmt.Println(" ", key, " : ", deviceMap[key])
13 }
14}
15
16func main() {
17 discoveryMode := gcipuinfo.DiscoverActivePartitionIPUs
18 if len(os.Args) > 1 {
19 if os.Args[1] == "--all-partitions" {
20 discoveryMode = gcipuinfo.DiscoverAllPartitionIPUs
21 } else {
22 fmt.Println("Error, unrecognised option.")
23 fmt.Println("Specify --all-partitions to show IPUs in all partitions")
24 os.Exit(1)
25 }
26 }
27
28 inventory := gcipuinfo.NewGcipuinfo(discoveryMode)
29
30 count := 0
31 for {
32 fmt.Println("*** Iteration ", count, " ***\n")
33 count++
34
35 inventory.UpdateData()
36 deviceMaps := inventory.GetDevices()
37 for _, attribs := range deviceMaps {
38 deviceMap := attribs.(map[string]interface{})
39
40 fmt.Println("Device ", deviceMap[gcipuinfo.DeviceId])
41
42 if deviceMap[gcipuinfo.UserProcessId] != nil {
43 printAttribute(deviceMap, gcipuinfo.UserProcessId)
44 printAttribute(deviceMap, gcipuinfo.UserExecutable)
45 printAttribute(deviceMap, gcipuinfo.UserName)
46 } else {
47 fmt.Println(" (device not in use by any known process)")
48 }
49 printAttribute(deviceMap, gcipuinfo.BoardIpuIndex)
50 printAttribute(deviceMap, gcipuinfo.SerialNumber)
51 printAttribute(deviceMap, gcipuinfo.ClockFrequency)
52 printAttribute(deviceMap, gcipuinfo.TotalBoardPower)
53 printAttribute(deviceMap, gcipuinfo.AverageBoardTemp)
54 printAttribute(deviceMap, gcipuinfo.IpuUtilisation)
55 printAttribute(deviceMap, gcipuinfo.MaxActiveCodeSize)
56 printAttribute(deviceMap, gcipuinfo.MaxActiveDataSize)
57 printAttribute(deviceMap, gcipuinfo.MaxActiveStackSize)
58 }
59 fmt.Println("")
60 time.Sleep(1 * time.Second)
61 }
62}
7.3. Graph power consumption of IPUs
7.3.1. Python
import argparse
import time
import sys
import math
import os

# Requires asciichartpy: pip3 install --user asciichartpy
import asciichartpy
import gcipuinfo

# Module-level inventory shared by the helper functions below. The script
# cannot do anything useful without at least one device, so bail out early.
ipu_info = gcipuinfo.gcipuinfo()
num_devices = len(ipu_info.getDevices())
if num_devices == 0:
    print("gc_power_consumption.py: error: no IPUs detected", file=sys.stderr)
    # sys.exit rather than the site-provided exit(), which is intended for
    # the interactive shell and is not guaranteed to exist in every run mode.
    sys.exit(-1)
16
17
def get_ipu_power_single(device_id):
    """Return the power reading of one device as a float.

    If device_id is outside 0..num_devices-1, an error is printed to stderr
    and the process exits with status -1.
    """
    if not 0 <= device_id < num_devices:
        print(
            f"gc_power_consumption.py: error: device id {device_id} does not exist (valid range is 0-{num_devices-1})",
            file=sys.stderr,
        )
        # sys.exit rather than the site-provided exit(), which is intended
        # for the interactive shell.
        sys.exit(-1)

    return pow_to_float(
        ipu_info.getNamedAttributeForDevice(device_id, gcipuinfo.IpuPower)
    )
29
30
def get_ipu_power_from_device_list(devices):
    """Return float power readings for the given device ids.

    Devices whose power attribute is reported as "N/A" are left out.
    """
    # Lazy generator: each device is queried just before its reading is
    # converted, in the order the ids were given.
    readings = (
        ipu_info.getNamedAttributeForDevice(device_id, gcipuinfo.IpuPower)
        for device_id in devices
    )
    return [pow_to_float(reading) for reading in readings if reading != "N/A"]
38
def get_ipu_power_all():
    """Return float power readings for all devices, omitting "N/A" entries."""
    usable = []
    for reading in ipu_info.getNamedAttributeForAll(gcipuinfo.IpuPower):
        if reading == "N/A":
            continue
        usable.append(pow_to_float(reading))
    return usable
42
43
def pow_to_float(pow):
    """Convert a power string such as "12.50W" to a float.

    The final character (the unit) is dropped before conversion. Anything
    that does not parse as a number afterwards, such as "N/A", yields 0.
    """
    numeric_part = pow[:-1]
    try:
        value = float(numeric_part)
    except ValueError:
        value = 0
    return value
51
52
def draw_graph(power_history, mode, num_devices, device_ids, min, max, width, height):
    """Print a centred title followed by an ASCII chart of power_history.

    min and max are optional y-axis bounds: a missing min becomes 0, and max
    is applied only when it is set and greater than the lower bound.
    """
    y_floor = min or 0
    graph_cfg = {
        "height": height - 3,  # Leave room for the title at the top
        "format": "{:8.2f}W ",
        "min": y_floor,
    }
    if max and max > y_floor:
        graph_cfg["max"] = max

    heading = mode.capitalize() + " power consumption for "
    if device_ids:
        heading += "IPUs: " + ", ".join(str(device_id) for device_id in device_ids)
    else:
        heading += str(num_devices) + " IPUs"
    print(heading.center(width))

    # The chart is encoded explicitly and written to the binary stdout buffer.
    chart = asciichartpy.plot(power_history, graph_cfg) + "\n"
    sys.stdout.buffer.write(chart.encode("utf-8"))
    sys.stdout.flush()
71
72
def main():
    """Poll IPU power at a fixed interval and redraw a console graph forever.

    Command-line options select the y-axis bounds, the polling interval, an
    optional list of device ids, and whether the plotted value is the mean or
    the total of the per-IPU readings.
    """
    parser = argparse.ArgumentParser(description="Display a console graph of IPU power consumption over time")
    parser.add_argument("--min", type=float, help="Minimum y-axis value, in watts")
    parser.add_argument("--max", type=float, help="Maximum y-axis value, in watts")
    parser.add_argument(
        "--interval",
        type=float,
        default=1,
        help="Interval between power queries, in seconds",
    )
    parser.add_argument(
        "--devices", type=int, nargs="+", help="only query specific devices"
    )
    parser.add_argument(
        "--mode",
        help="Plot the mean or the total power of the queried IPUs",
        choices=["mean", "total"],
        required=False,
        default="mean",
    )

    # This example assumes per-IPU power sensors, which are not available on
    # C2 devices
    if ipu_info.getNamedAttributeForDevice(0, gcipuinfo.BoardType) != "M2000":
        print("This program is only supported on IPU-Machine devices")
        sys.exit(1)

    args = parser.parse_args()

    try:
        term_width, term_height = os.get_terminal_size()
    except OSError:
        print(
            "gc_power_consumption.py: warning: stdout is not attached to a tty, using 50x50 graph",
            file=sys.stderr,
        )
        # Both dimensions need a value; unpacking a bare 50 raises TypeError.
        term_width, term_height = 50, 50

    power_history = []
    max_entries = term_width - 15  # Leave enough room for the y-axis labels

    while True:
        if not args.devices:
            powers = get_ipu_power_all()
        else:
            powers = get_ipu_power_from_device_list(args.devices)
        if len(powers) > 0:
            if args.mode == "mean":
                val = sum(powers) / len(powers)
            else:
                val = sum(powers)
            power_history.append(val)
            # Keep only the most recent samples that fit the terminal width.
            if len(power_history) > max_entries:
                power_history = power_history[1:]

        if any(power != 0 for power in power_history):
            draw_graph(
                power_history,
                args.mode,
                len(powers),
                args.devices,
                min=args.min,
                max=args.max,
                width=term_width,
                height=term_height,
            )
        else:
            print(" -- Waiting for devices to power on...", end="\r")

        time.sleep(args.interval)


if __name__ == "__main__":
    main()
7.4. Display a message when device 0 is in use
Example output when device 0 is idle:
$ go run gc_track_device_0.go
Device 0 is idle
Example output when device 0 is in use:
$ go run gc_track_device_0.go
User exampleuser is running application gc-powertest on device 0
7.4.1. C++
1#include <iostream>
2#include <string>
3
4#include "graphcore_target_access/gcipuinfo/IPUAttributeLabels.h"
5#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
6
7int main() {
8 gcipuinfo inventory;
9 int deviceOfInterest = 0; // device id
10
11 std::string exeName = inventory.getNamedAttributeForDevice(
12 deviceOfInterest, IPUAttributeLabels::UserExecutable);
13 std::string userName = inventory.getNamedAttributeForDevice(
14 deviceOfInterest, IPUAttributeLabels::UserName);
15
16 if (!exeName.empty()) {
17 std::cout << "User " << userName << " is running application " << exeName
18 << " on device " << deviceOfInterest << "\n";
19 } else {
20 std::cout << "Device " << deviceOfInterest << " is idle\n";
21 }
22
23 return 0;
24}
7.4.2. Python
import gcipuinfo

# Report whether device 0 currently has a user executable associated with it
# and, if so, which user is running it.
inventory = gcipuinfo.gcipuinfo()
device_of_interest = 0  # device id
exe_name = inventory.getNamedAttributeForDevice(device_of_interest, gcipuinfo.UserExecutable)
user_name = inventory.getNamedAttributeForDevice(device_of_interest, gcipuinfo.UserName)

if exe_name:
    # Wording matches the documented example output and the C++/Go versions
    # ("is running application ...").
    print(
        "User {} is running application {} on device {}".format(
            user_name, exe_name, device_of_interest
        )
    )
else:
    print("Device {} is idle".format(device_of_interest))
7.4.3. Go
1package main
2
3import (
4 "fmt"
5 "gcipuinfo"
6)
7
8func main() {
9
10 inventory := gcipuinfo.NewGcipuinfo()
11 deviceOfInterest := 0 // device id
12 exeName := inventory.GetNamedAttributeForDevice(deviceOfInterest, gcipuinfo.UserExecutable)
13 userName := inventory.GetNamedAttributeForDevice(deviceOfInterest, gcipuinfo.UserName)
14 if len(exeName) > 0 {
15 fmt.Printf("User %s is running application %s on device %d\n", userName, exeName, deviceOfInterest)
16 } else {
17 fmt.Printf("Device %d is idle\n", deviceOfInterest)
18 }
19}
7.5. Display application event record entries
This example program demonstrates how to retrieve and interpret application event records. The program takes one or more application event record paths as command line parameters and displays the contents of any events it finds in the specified paths, along with a list of all IPU-Machine hosts named in at least one event.
Note
For demonstration purposes, there is a write_example_event_record_entry.py script included in the examples/python directory, which can write example application event record entries.
Example output when run with two application event records that are both empty:
$ python gc_event_record.py /tmp/ipu_app_event_record_1 /tmp/ipu_app_event_record_2
== Checking application event record: /tmp/ipu_app_event_record_1
No event record entry found.
== Checking application event record: /tmp/ipu_app_event_record_2
No event record entry found.
Example output when there is an entry in /tmp/ipu_app_event_record_1 but not in /tmp/ipu_app_event_record_2:
$ python gc_event_record.py /tmp/ipu_app_event_record_1 /tmp/ipu_app_event_record_2
== Checking application event record: /tmp/ipu_app_event_record_1
{
"attached ipu hosts": [
"10.1.5.10"
],
"attached ipus": [
0,
1
],
"command line": "./example program with args",
"description": "A nonrecoverable error has occurred.",
"event record path": "/tmp/ipu_app_event_record_1/last_event.json",
"partition": "p1",
"pid": "35024",
"severity": "nonrecoverable",
"specific ipu hosts": [
"10.1.5.10"
],
"specific ipus": [
0
],
"timestamp": "2021-11-29T17:23:54.989896Z"
}
Last event was an error.
== Checking application event record: /tmp/ipu_app_event_record_2
No event record entry found.
== IPU-Machine hosts involved in errors, across all application event records:
10.1.5.10
7.5.1. C++
1#include <iostream>
2#include <set>
3
4#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
5#include <nlohmann/json.hpp>
6using json = nlohmann::json;
7
8int main(int argc, char *argv[]) {
9
10 if (argc < 2) {
11 std::cerr << __FILE__ << ": error requires at least one argument\n";
12 return 1;
13 }
14
15 gcipuinfo inventory;
16
17 std::set<std::string> hostsNamedInErrors;
18
19 for (int i = 1; i < argc; i++) {
20 std::cout << "== Checking application event record: " << argv[i] << "\n";
21
22 std::string eventRecordJSON =
23 inventory.getLastAppEventRecordAsJSON(gcipuinfo::EventSevNone, argv[i]);
24
25 json j = json::parse(eventRecordJSON);
26 if (j.empty()) {
27 std::cout << "No event record entry found.\n";
28 } else {
29 std::cout << j.dump(2) << "\n";
30
31 // If the event has named specific IPUs
32 // record their associated machine names
33 bool foundSpecificHosts = false;
34 if (j.contains(gcipuinfo::keySpecificIPUHosts)) {
35 auto hosts = j[gcipuinfo::keySpecificIPUHosts];
36 foundSpecificHosts = hosts.size() > 0;
37 for (const auto &element : hosts.items()) {
38 hostsNamedInErrors.insert(element.value().dump());
39 }
40 }
41 if (!foundSpecificHosts && j.contains(gcipuinfo::keyAttachedIPUHosts)) {
42 // if no specific IPUs have been mentioned in this error,
43 // fall back to recording hosts of all attached IPUs
44 auto hosts = j[gcipuinfo::keyAttachedIPUHosts];
45 for (const auto &element : hosts.items()) {
46 hostsNamedInErrors.insert(element.value().dump());
47 }
48 }
49 auto severity = inventory.getLastAppEventRecordSeverity(argv[i]);
50 if (severity > gcipuinfo::EventSevWarning) {
51 std::cout << "Last event was an error.\n";
52 }
53 }
54 std::cout << "\n";
55 }
56
57 if (hostsNamedInErrors.size() > 0) {
58 std::cout << "== IPU-Machine hosts involved in errors, across all "
59 "application event records:\n";
60 for (auto host : hostsNamedInErrors) {
61 std::cout << " " << host << "\n";
62 }
63 }
64
65 return 0;
66}
7.5.2. Python
import sys
import json
import gcipuinfo

if len(sys.argv) < 2:
    print("Error, requires at least one argument")
    sys.exit(1)

inventory = gcipuinfo.gcipuinfo()

# Hosts collected across every event record path given on the command line.
hosts_named_in_errors = set()

for path in sys.argv[1:]:
    print("== Checking application event record: " + path)
    event_record_json = inventory.getLastAppEventRecordAsJSON(inventory.EventSevNone, path)
    record = json.loads(event_record_json)
    if record:
        print(event_record_json)

        # Prefer the hosts of the specifically named IPUs. When that list is
        # missing or empty, fall back to the hosts of all attached IPUs.
        hosts = []
        if inventory.keySpecificIPUHosts in record:
            hosts = record[inventory.keySpecificIPUHosts]
        if len(hosts) == 0 and inventory.keyAttachedIPUHosts in record:
            hosts = record[inventory.keyAttachedIPUHosts]
        hosts_named_in_errors.update(hosts)

        if inventory.getLastAppEventRecordSeverity(path) > inventory.EventSevWarning:
            print("Last event was an error.")
    else:
        print("No event record entry found.")
    print("")

if hosts_named_in_errors:
    print("== IPU-Machine hosts involved in errors, across all application event records:")
    for host in hosts_named_in_errors:
        print(" " + host)
7.5.3. Go
1package main
2
3import (
4 "fmt"
5 "os"
6 "gcipuinfo"
7 "encoding/json"
8)
9
10func main() {
11 inventory := gcipuinfo.NewGcipuinfo()
12
13 if len(os.Args) < 2 {
14 panic("Error, requires at least one argument")
15 }
16
17 hostsNamedInErrors := map[string]bool{}
18
19 for i := 1; i < len(os.Args); i++ {
20 path := os.Args[i]
21 fmt.Println("== Checking application event record: ", path)
22 eventRecordJSON := inventory.GetLastAppEventRecordAsJSON(gcipuinfo.EventSevNone, path)
23 var eventRecord map[string]interface{}
24 json.Unmarshal([]byte(eventRecordJSON), &eventRecord)
25 if len(eventRecord) == 0 {
26 fmt.Printf("No event record entry found.\n")
27 } else {
28 fmt.Println(eventRecordJSON)
29
30 // If the event has named specific IPUs
31 // record their associated machine names
32 foundSpecificHosts := false
33 if _, ok := eventRecord[gcipuinfo.KeySpecificIPUHosts]; ok {
34 hosts := eventRecord[gcipuinfo.KeySpecificIPUHosts].([]interface{})
35 foundSpecificHosts = len(hosts) > 0
36 for _, host := range hosts {
37 hostsNamedInErrors[fmt.Sprint(host)] = true
38 }
39 }
40 if !foundSpecificHosts {
41 // if no specific IPUs have been mentioned in this error,
42 // fall back to recording hosts of all attached IPUs
43 if _, ok := eventRecord[gcipuinfo.KeyAttachedIPUHosts]; ok {
44 hosts := eventRecord[gcipuinfo.KeyAttachedIPUHosts].([]interface{})
45 for _, host := range hosts {
46 hostsNamedInErrors[fmt.Sprint(host)] = true
47 }
48 }
49 }
50 severity := inventory.GetLastAppEventRecordSeverity(path);
51 if (severity > gcipuinfo.EventSevWarning) {
52 fmt.Println("Last event was an error.\n");
53 }
54 }
55 }
56 if len(hostsNamedInErrors) > 0 {
57 fmt.Printf("== IPU-Machine hosts involved in errors, across all application event records:\n");
58 for host, _ := range hostsNamedInErrors {
59 fmt.Printf(" %s\n", host)
60 }
61 }
62}
7.6. Display device health-check result
This example runs forever displaying device health-check results, both as a raw JSON string and then, to demonstrate
how to parse the output, in a summarised message.
By default, the health checks will only be run on IPUs in the active partition.
If you specify the --all-partitions flag, the gcipuinfo object is configured with the DiscoverAllPartitionIPUs option, and health checks will be run on IPUs in all known partitions.
Example output when there are no failing devices:
$ python gc_health_check.py
** Iteration 0 ***
Raw JSON:
{}
No errors
** Iteration 1 ***
Raw JSON:
{}
No errors
** Iteration 2 ***
Raw JSON:
{}
No errors
...
Example output when device 1 on 10.1.5.10 has failed:
$ python gc_health_check.py
** Iteration 0 ***
Raw JSON:
{
"hosts": {
"10.1.5.10": [
{
"error": "connection",
"id": "1",
"partition": "p1",
"board ipu index": "2"
}
]
}
}
Parsed:
host: 10.1.5.10
device id: 1, partition: p1, error: connection
...
7.6.1. C++
1#include <chrono>
2#include <iostream>
3#include <string.h>
4#include <thread>
5
6#include "graphcore_target_access/gcipuinfo/gcipuinfo.h"
7#include <nlohmann/json.hpp>
8
9using json = nlohmann::json;
10
11int main(int argc, char *argv[]) {
12
13 DeviceDiscoveryMode discoveryMode = DiscoverActivePartitionIPUs;
14 if (argc > 1) {
15 if (strcmp(argv[1], "--all-partitions") == 0) {
16 discoveryMode = DiscoverAllPartitionIPUs;
17 } else {
18 std::cerr << "Error, unrecognised option.\n";
19 std::cerr << "Specify --all-partitions to show IPUs in all partitions\n";
20 std::exit(1);
21 }
22 }
23 gcipuinfo inventory(discoveryMode);
24
25 unsigned count = 0;
26 unsigned numIPUs = inventory.getDevices().size();
27 while (true) {
28 std::cout << "*** Iteration " << count++ << " ***\n\n";
29 std::cout << "Checking " << numIPUs << " devices:\n";
30 // Set 100ms timeout on response from each IPU. Don't check active IPUs.
31 std::string devicesHealth = inventory.checkHealthOfDevices(100, false);
32 std::cout << "Raw JSON: " << devicesHealth << "\n";
33 json j = json::parse(devicesHealth);
34 if (j.contains("hosts")) {
35 std::cout << "Parsed:\n";
36 auto hosts = j["hosts"];
37 for (auto host : hosts.items()) {
38 std::cout << " host: " << host.key() << "\n";
39 for (auto device : host.value().items()) {
40 std::cout << " device id: " << device.value()["id"]
41 << ", partition: " << device.value()["partition"]
42 << ", error: " << device.value()["error"] << "\n";
43 }
44 }
45 } else if (j.contains("error")) {
46 std::cout << "Parsed:\n";
47 std::cout << " error: " << j["error"]
48 << ", description: " << j["description"] << "\n";
49 } else {
50 std::cout << "No errors\n";
51 }
52 std::cout << "\n";
53 std::this_thread::sleep_for(std::chrono::milliseconds(10));
54 }
55
56 return 0;
57}
7.6.2. Python
import json
import time
import sys
import gcipuinfo

# Choose which IPUs are discovered: the active partition unless the first
# command-line argument is --all-partitions. Any other argument is an error.
extra_args = sys.argv[1:]
if not extra_args:
    discovery_mode = gcipuinfo.DiscoverActivePartitionIPUs
elif extra_args[0] == "--all-partitions":
    discovery_mode = gcipuinfo.DiscoverAllPartitionIPUs
else:
    print("Error, unrecognised option.")
    print("Specify --all-partitions to show IPUs in all partitions")
    sys.exit(1)

inventory = gcipuinfo.gcipuinfo(discovery_mode)
num_ipus = len(inventory.getDevices())

iteration = 0
while True:
    print("** Iteration " + str(iteration) + " ***")
    print("Checking " + str(num_ipus) + " devices")
    devices_health = inventory.checkHealthOfDevices(100, False)
    print("Raw JSON: \n" + devices_health)

    report = json.loads(devices_health)
    if "hosts" in report:
        # Failing devices are grouped under the host they belong to.
        print("Parsed:")
        for host_name, devices in report["hosts"].items():
            print(" host: " + host_name)
            for device in devices:
                summary = " device id: " + device["id"]
                summary += ", partition: " + device["partition"]
                summary += ", error: " + device["error"]
                print(summary)
    elif "error" in report:
        # The result carries a top-level error with a description.
        print("Parsed:")
        print(" error: " + report["error"] + ", description: " + report["description"])
    else:
        print("No errors")
    print("")

    iteration += 1
    time.sleep(0.01)
7.6.3. Go
1package main
2
3import (
4 "os"
5 "fmt"
6 "gcipuinfo"
7 "encoding/json"
8 "time"
9)
10
11func main() {
12 discoveryMode := gcipuinfo.DiscoverActivePartitionIPUs
13 if len(os.Args) > 1 {
14 if os.Args[1] == "--all-partitions" {
15 discoveryMode = gcipuinfo.DiscoverAllPartitionIPUs
16 } else {
17 fmt.Println("Error, unrecognised option.")
18 fmt.Println("Specify --all-partitions to show IPUs in all partitions")
19 os.Exit(1)
20 }
21 }
22
23 inventory := gcipuinfo.NewGcipuinfo(discoveryMode)
24 numIPUs := len(inventory.GetDevices())
25
26 count := 0
27 for {
28 fmt.Println("*** Iteration ", count, " ***\n")
29 fmt.Println("Checking ", numIPUs, " devices\n")
30 count++
31
32 var devicesHealth string
33 devicesHealth = inventory.CheckHealthOfDevices(100, false)
34 fmt.Println("Raw JSON:", devicesHealth)
35
36 var result map[string]interface{}
37 json.Unmarshal([]byte(devicesHealth), &result)
38 if _, ok := result["hosts"]; ok {
39 fmt.Println("Parsed:")
40 hosts := result["hosts"].(map[string]interface{})
41 for hostName, hostVal := range hosts {
42 fmt.Println(" host: ", hostName)
43 devices := hostVal.([]interface{})
44 for _, deviceVal := range devices {
45 device := deviceVal.(map[string]interface{})
46 fmt.Println(" device id: ", device["id"], "partition: ", device["partition"], ", error: ", device["error"])
47 }
48 }
49 } else if _, ok := result["error"]; ok {
50 fmt.Println("Parsed:")
51 fmt.Println(" error: ", result["error"], ", description: ", result["description"])
52 } else {
53 fmt.Println("No errors")
54 }
55 time.Sleep(time.Duration(10) * time.Millisecond)
56 }
57}