I have been trying to run the following code, based on the GluonCV image classification tutorial dive_deep_cifar10.
While it produces the expected output, the process never exits, even after it reaches the end of the script (saving an image, test.png).
Similar code in PyTorch exits properly when it reaches the end. Could anyone help me resolve this issue?
Thank you
from __future__ import division
from gluoncv import data, utils
from matplotlib import pyplot as plt
import gluoncv as gcv
from gluoncv.data import VOCDetection
from gluoncv.utils import viz
import argparse, time, logging, random, math
import numpy as np
import mxnet as mx
from mxnet import gluon, nd
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms
from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, TrainingHistory
from gluoncv.data import transforms as gcv_transforms
def test(ctx, val_data):
    # Note: uses the global `net` defined in the main block below
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()
if __name__=="__main__":
train_dataset = data.VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
val_dataset = data.VOCDetection(splits=[(2007, 'test')])
num_gpus = 1
ctx = [mx.gpu(i) for i in range(num_gpus)]
# Get the model CIFAR_ResNet20_v1, with 10 output classes, without pre-trained weights
net = get_model('cifar_resnet20_v1', classes=10)
net.initialize(mx.init.Xavier(), ctx = ctx)
    transform_train = transforms.Compose([
        # Pad the image to 40x40, then randomly crop a 32x32 region
        gcv_transforms.RandomCrop(32, pad=4),
        # Randomly flip the image horizontally
        transforms.RandomFlipLeftRight(),
        # Transpose the image from height*width*num_channels to num_channels*height*width
        # and map values from [0, 255] to [0, 1]
        transforms.ToTensor(),
        # Normalize the image with mean and standard deviation calculated across all images
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])

    # Batch size for each GPU
    per_device_batch_size = 128
    # Number of data loader workers
    num_workers = 8
    # Effective total batch size
    batch_size = per_device_batch_size * num_gpus

    # train=True selects the training split; shuffle=True shuffles it each epoch
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
    # train=False selects the test split, used here for validation
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)
    # Learning rate decay factor
    lr_decay = 0.1
    # Epochs at which the learning rate decays
    lr_decay_epoch = [80, 160, np.inf]
    # Nesterov accelerated gradient descent
    optimizer = 'nag'
    # Optimizer parameters
    optimizer_params = {'learning_rate': 0.1, 'wd': 0.0001, 'momentum': 0.9}
    # Define our trainer for net
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    train_metric = mx.metric.Accuracy()
    train_history = TrainingHistory(['training-error', 'validation-error'])

    epochs = 3
    lr_decay_count = 0
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0
        # Learning rate decay
        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate * lr_decay)
            lr_decay_count += 1
        # Loop through each batch of training data
        for i, batch in enumerate(train_data):
            # Extract data and label
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
            # AutoGrad
            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
            # Backpropagation
            for l in loss:
                l.backward()
            # Optimize
            trainer.step(batch_size)
            # Update metrics
            train_loss += sum([l.sum().asscalar() for l in loss])
            train_metric.update(label, output)
        name, acc = train_metric.get()
        # Evaluate on validation data
        name, val_acc = test(ctx, val_data)
        # Update history and print metrics
        train_history.update([1 - acc, 1 - val_acc])
        print('[Epoch %d] train=%f val=%f loss=%f time: %f' %
              (epoch, acc, val_acc, train_loss, time.time() - tic))

    # Plot the metric scores and save the figure; this is the last statement in the script
    train_history.plot(save_path="test.png")
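To confirm that execution really gets past this final statement (rather than hanging inside the plot call), a print can be appended as the very last line of the main block; this is just a diagnostic I added, not part of the tutorial:

    # Diagnostic (my addition): if this prints and the process still hangs,
    # the hang occurs at interpreter shutdown, after all user code has run
    print('reached end of script')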
The output is as expected:
[Epoch 0] train=0.472676 val=0.606800 loss=72228.871788 time: 19.881742
[Epoch 1] train=0.663442 val=0.681100 loss=47372.536560 time: 19.804182
[Epoch 2] train=0.731751 val=0.734900 loss=37964.900452 time: 19.507770
Even after test.png has been saved, the process does not return to the shell. ps -a shows the following python processes still running:

ps -a
PID TTY TIME CMD
964 tty1 00:00:01 Xorg
1112 tty1 00:00:00 gnome-session-b
1764 tty2 00:01:00 Xorg
1777 tty2 00:00:00 gnome-session-b
10275 pts/1 00:01:23 python
10315 pts/1 00:00:09 python
10324 pts/1 00:00:09 python
10333 pts/1 00:00:09 python
10342 pts/1 00:00:09 python
10351 pts/1 00:00:09 python
10360 pts/1 00:00:09 python
10369 pts/1 00:00:09 python
10378 pts/1 00:00:09 python
10391 pts/1 00:00:00 python
10400 pts/1 00:00:00 python
10409 pts/1 00:00:00 python
10418 pts/1 00:00:00 python
10427 pts/1 00:00:00 python
10436 pts/1 00:00:00 python
10445 pts/1 00:00:00 python
10454 pts/1 00:00:00 python
12313 pts/1 00:00:00 ps
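My guess is that the leftover python processes are the DataLoader workers (num_workers=8 above), since Gluon's DataLoader spawns its workers via multiprocessing. As a check (my assumption, not something from the tutorial), the loaders could be rebuilt with workers disabled to see whether the script then exits cleanly:

    # Diagnostic: load data in the main process only.
    # If the script now exits normally, the hang is tied to the worker processes.
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=0)
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=0)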