9. Examples
You can find PyTorch examples and tutorials in the Graphcore GitHub examples repository. This contains:
Examples of popular machine learning models for training and inference
Source code from videos, blogs and other documents
9.1. MNIST example
The example in Listing 9.1 shows how an MNIST model can be run on the IPU. The highlighted lines show the PopTorch-specific code required to run the example on multiple IPUs.
You can download the full source code from GitHub: mnist.py.
To run this example, you will need to install the Poplar SDK (see the Getting Started Guide for your IPU system) and the appropriate version of torchvision:
$ python3 -m pip install torchvision==0.11.1
import torch
import torch.nn as nn
import torchvision
import poptorch

# Normal pytorch batch size
training_batch_size = 20
validation_batch_size = 100

opts = poptorch.Options()
# Device "step"
opts.deviceIterations(20)

# How many IPUs to replicate over.
opts.replicationFactor(4)

opts.randomSeed(42)

# Load the MNIST training split.
training_data = poptorch.DataLoader(
    opts,
    torchvision.datasets.MNIST(
        'mnist_data/',
        train=True,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
        ])),
    batch_size=training_batch_size,
    shuffle=True)

# Load the held-out MNIST split (train=False) so that validation is not
# measured on the same samples the model was trained on.
# drop_last=True keeps every validation batch at exactly
# validation_batch_size samples.
val_options = poptorch.Options()
validation_data = poptorch.DataLoader(
    val_options,
    torchvision.datasets.MNIST(
        'mnist_data/',
        train=False,
        download=True,
        transform=torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.1307, ), (0.3081, )),
        ])),
    batch_size=validation_batch_size,
    shuffle=True,
    drop_last=True)
63
# A helper block to build convolution-pool-relu blocks.
class Block(nn.Module):
    """Convolution followed by max-pooling and a ReLU activation.

    Args:
        in_channels: Number of channels of the input passed to the
            convolution.
        num_filters: Number of output channels of the convolution.
        kernel_size: Kernel size of the convolution.
        pool_size: Kernel size of the max-pooling layer.
    """

    def __init__(self, in_channels, num_filters, kernel_size, pool_size):
        super().__init__()
        self.conv = nn.Conv2d(in_channels,
                              num_filters,
                              kernel_size=kernel_size)
        self.pool = nn.MaxPool2d(kernel_size=pool_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply convolution, then pooling, then ReLU, and return the result."""
        x = self.conv(x)
        x = self.pool(x)
        x = self.relu(x)
        return x
79
# Define the network using the above blocks.
class Network(nn.Module):
    """Two convolution blocks followed by two linear layers.

    The loss is computed inside ``forward`` when a target is supplied, so a
    single call yields both the predictions and the loss.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = Block(1, 10, 5, 2)
        self.layer2 = Block(10, 20, 5, 2)
        self.layer3 = nn.Linear(320, 256)
        self.layer3_act = nn.ReLU()
        self.layer4 = nn.Linear(256, 10)

        self.softmax = nn.LogSoftmax(1)
        self.loss = nn.NLLLoss(reduction="mean")

    def forward(self, x, target=None):
        """Run the network on ``x``.

        Args:
            x: Input batch fed to the first convolution block.
            target: Optional class labels. When given, the NLL loss of the
                predictions against these labels is also returned.

        Returns:
            The log-softmax output when ``target`` is ``None``, otherwise a
            ``(output, loss)`` tuple.
        """
        x = self.layer1(x)
        x = self.layer2(x)
        # Flatten to rows of 320 features, the input width of layer3
        # (20 channels * 4 * 4 for 28x28 inputs).
        x = x.view(-1, 320)

        x = self.layer3_act(self.layer3(x))
        x = self.layer4(x)
        x = self.softmax(x)

        if target is not None:
            loss = self.loss(x, target)
            return x, loss
        return x
106
# Create our model.
model = Network()

# Create model for training which will run on IPU.
training_model = poptorch.trainingModel(model, training_data.options)

# Same model as above, they will share weights (in 'model') which once
# training is finished can be copied back.
inference_model = poptorch.inferenceModel(model, validation_data.options)
115
def train():
    """Train the model on every batch of ``training_data``.

    Every 10th batch the loss is printed together with the accuracy of the
    predictions returned for that batch.
    """
    for batch_number, (data, labels) in enumerate(training_data):
        output, losses = training_model(data, labels)

        # NOTE(review): the report below is nested under this condition on
        # the assumption that accuracy is only logged with the loss - confirm.
        if batch_number % 10 == 0:
            print(f"PoptorchIPU loss at batch: {batch_number} is {losses}")

            # Pick the highest probability.
            _, ind = torch.max(output, 1)
            assert training_model.options.output_mode in (
                poptorch.OutputMode.All, poptorch.OutputMode.Final
            ), "Only 'Final' and 'All' OutputMode supported"
            # If we're using Final: only keep the last labels, no-op if using All
            num_labels = ind.shape[0]
            labels = labels[-num_labels:]
            eq = torch.eq(ind, labels)

            # Count the matches directly. Inspecting the unique values of
            # `eq` reported 0% whenever every prediction was correct,
            # because only one unique value (True) is present in that case.
            acc = (eq.sum().item() / num_labels) * 100.0

            print(
                f"Training accuracy: {acc}% from batch of size {num_labels}"
            )
    print("Done training")
147
def test():
    """Run ``inference_model`` over ``validation_data`` and print the accuracy."""
    correct = 0
    total = 0
    with torch.no_grad():
        for (data, labels) in validation_data:
            output = inference_model(data)

            # Argmax the probabilities to get the highest.
            _, ind = torch.max(output, 1)

            # Compare it against the ground truth for this batch.
            eq = torch.eq(ind, labels)

            # Number of correct predictions in this batch.
            correct += eq.sum().item()

            # Count the samples actually seen rather than assuming that
            # every batch holds validation_batch_size elements.
            total += labels.shape[0]
    print("Validation: of " + str(total) + " samples we got: " +
          str((correct / total) * 100.0) + "% correct")
175
# Train on IPU.
train()

# Evaluate the trained weights with the inference model.
test()