MXNet process does not stop

I have been trying to run the following code, based on the Gluon image classification tutorial dive_deep_cifar10.

While it produces the expected output, the process never ends, even after it reaches the end of the script (saving the image test.png).

Similar code in PyTorch exits properly when it reaches the end of the script. Could anyone help me resolve this issue?
Thank you



from __future__ import division
from gluoncv import data, utils
from matplotlib import pyplot as plt

import gluoncv as gcv
from gluoncv.data import VOCDetection
from gluoncv.utils import viz


import argparse, time, logging, random, math

import numpy as np
import mxnet as mx

from mxnet import gluon, nd
from mxnet import autograd as ag
from mxnet.gluon import nn
from mxnet.gluon.data.vision import transforms

from gluoncv.model_zoo import get_model
from gluoncv.utils import makedirs, TrainingHistory
from gluoncv.data import transforms as gcv_transforms



def test(ctx, val_data):
    # Evaluate the (global) net on the validation data, splitting each
    # batch across the available contexts.
    metric = mx.metric.Accuracy()
    for i, batch in enumerate(val_data):
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        outputs = [net(X) for X in data]
        metric.update(label, outputs)
    return metric.get()



if __name__=="__main__":

    # Pascal VOC detection datasets (note: these are loaded but never
    # used below; the training itself is on CIFAR-10)
    train_dataset = data.VOCDetection(splits=[(2007, 'trainval'), (2012, 'trainval')])
    val_dataset = data.VOCDetection(splits=[(2007, 'test')])
    
    
    num_gpus = 1
    ctx = [mx.gpu(i) for i in range(num_gpus)]
    
    # Get the model CIFAR_ResNet20_v1, with 10 output classes, without pre-trained weights
    net = get_model('cifar_resnet20_v1', classes=10)
    net.initialize(mx.init.Xavier(), ctx=ctx)
    
    
    transform_train = transforms.Compose([
        # Pad the image by 4 pixels on each side (to 40x40), then randomly crop a 32x32 area
        gcv_transforms.RandomCrop(32, pad=4),
        # Randomly flip the image horizontally
        transforms.RandomFlipLeftRight(),
        # Transpose the image from height*width*num_channels to num_channels*height*width
        # and map values from [0, 255] to [0,1]
        transforms.ToTensor(),
        # Normalize the image with mean and standard deviation calculated across all images
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])
    
    
    
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])
    
    
    # Batch Size for Each GPU
    per_device_batch_size = 128
    # Number of data loader workers
    num_workers = 8
    # Calculate effective total batch size
    batch_size = per_device_batch_size * num_gpus
    
    # Set train=True for training data
    # Set shuffle=True to shuffle the training data
    train_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=True).transform_first(transform_train),
        batch_size=batch_size, shuffle=True, last_batch='discard', num_workers=num_workers)
    
    # Set train=False for validation data
    val_data = gluon.data.DataLoader(
        gluon.data.vision.CIFAR10(train=False).transform_first(transform_test),
        batch_size=batch_size, shuffle=False, num_workers=num_workers)
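    # Note: with num_workers=8, each of these DataLoaders forks its own
    # pool of worker processes, so the script runs as a group of python
    # processes rather than a single one.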
    
    
    
    
    # Learning rate decay factor
    lr_decay = 0.1
    # Epochs where learning rate decays
    lr_decay_epoch = [80, 160, np.inf]
    
    # Nesterov accelerated gradient descent
    optimizer = 'nag'
    # Set parameters
    optimizer_params = {'learning_rate': 0.1, 'wd': 0.0001, 'momentum': 0.9}
    
    # Define our trainer for net
    trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
    
    
    loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
    
    
    train_metric = mx.metric.Accuracy()
    train_history = TrainingHistory(['training-error', 'validation-error'])
    
    
    
    epochs = 3
    lr_decay_count = 0
    
    for epoch in range(epochs):
        tic = time.time()
        train_metric.reset()
        train_loss = 0
    
        # Learning rate decay
        if epoch == lr_decay_epoch[lr_decay_count]:
            trainer.set_learning_rate(trainer.learning_rate*lr_decay)
            lr_decay_count += 1
    
        # Loop through each batch of training data
        for i, batch in enumerate(train_data):
            # Extract data and label
            data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
            label = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
    
            # AutoGrad
            with ag.record():
                output = [net(X) for X in data]
                loss = [loss_fn(yhat, y) for yhat, y in zip(output, label)]
    
            # Backpropagation
            for l in loss:
                l.backward()
    
            # Optimize
            trainer.step(batch_size)
    
            # Update metrics
            train_loss += sum([l.sum().asscalar() for l in loss])
            train_metric.update(label, output)
    
        name, acc = train_metric.get()
        # Evaluate on Validation data
        name, val_acc = test(ctx, val_data)
    
        # Update history and print metrics
        train_history.update([1-acc, 1-val_acc])
        print('[Epoch %d] train=%f val=%f loss=%f time: %f' %
            (epoch, acc, val_acc, train_loss, time.time()-tic))
    
    # We can plot the metric scores with:
    
    train_history.plot(save_path="test.png")
    
   

The output is as expected:

[Epoch 0] train=0.472676 val=0.606800 loss=72228.871788 time: 19.881742
[Epoch 1] train=0.663442 val=0.681100 loss=47372.536560 time: 19.804182
[Epoch 2] train=0.731751 val=0.734900 loss=37964.900452 time: 19.507770

ps -a

    PID TTY          TIME CMD
    964 tty1     00:00:01 Xorg
   1112 tty1     00:00:00 gnome-session-b
   1764 tty2     00:01:00 Xorg
   1777 tty2     00:00:00 gnome-session-b
  10275 pts/1    00:01:23 python
  10315 pts/1    00:00:09 python
  10324 pts/1    00:00:09 python
  10333 pts/1    00:00:09 python
  10342 pts/1    00:00:09 python
  10351 pts/1    00:00:09 python
  10360 pts/1    00:00:09 python
  10369 pts/1    00:00:09 python
  10378 pts/1    00:00:09 python
  10391 pts/1    00:00:00 python
  10400 pts/1    00:00:00 python
  10409 pts/1    00:00:00 python
  10418 pts/1    00:00:00 python
  10427 pts/1    00:00:00 python
  10436 pts/1    00:00:00 python
  10445 pts/1    00:00:00 python
  10454 pts/1    00:00:00 python
  12313 pts/1    00:00:00 ps

Unfortunately, I cannot reproduce your problem with your script. Which versions of MXNet and GluonCV are you using? And which OS did you run it on?

Thank you for the reply.
I forgot to mention in my question: all python processes in the ps -a output are due to the MXNet script that I posted.
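
In case it is useful for debugging, one way to confirm where those processes come from (a suggestion on my part, not something the script above does) would be to list the interpreter's live child processes just before the script ends; with num_workers=8 per DataLoader, the workers should show up there:

import multiprocessing as mp

# Print this interpreter's live child processes; the DataLoader
# workers (8 per loader in the script above) should be among them.
for child in mp.active_children():
    print(child.pid, child.name)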

Distributor ID:	Ubuntu
Description:	Ubuntu 20.04 LTS
Release:	20.04
Codename:	focal

MXNet and GluonCV versions

In [1]: import mxnet as mx

In [2]: mx.__version__
Out[2]: '1.6.0'

In [3]: import gluoncv as gcv

In [4]: gcv.__version__
Out[4]: '0.7.0'
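
For completeness, a blunt workaround (untested here, and a workaround rather than a fix) would be to force the interpreter to exit at the very end of the script, after waiting for all pending MXNet work, so the shutdown phase where the process appears to get stuck is skipped:

import os

# ... end of the training script, after train_history.plot(...) ...

# Block until all asynchronous MXNet operations have completed.
mx.nd.waitall()

# Exit immediately, skipping Python's normal interpreter shutdown
# (which is where the process appears to hang).
os._exit(0)

Setting num_workers=0 in both DataLoaders is another way to check whether the worker processes are what keeps the script alive.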