Softmax from scratch doesn't work


#1

Hi. In the section “Softmax Regression from Scratch” I tried to run the code as given in the book. However, the first (outer) for loop doesn’t iterate over all the epochs; it just stops after one epoch. I tested the code and found that this problem happens only when the softmax function is used inside the second (inner) for loop. I have checked a few other functions inside the second for loop and all of them worked. My code is given below.

Issue code:

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    """Train `net` for `num_epochs` passes over `train_iter`, printing stats per epoch.

    For every mini-batch the forward pass and loss are recorded by autograd,
    gradients are computed with `backward()`, and the parameters are updated
    either with `nb.sgd(params, lr, batch_size)` (when `trainer` is None) or
    with `trainer.step(batch_size)`.

    After each epoch the mean of the per-batch losses, the mean of the
    per-batch training accuracies and the test accuracy are printed.

    NOTE(review): the indentation was lost when this snippet was pasted; the
    nesting below is reconstructed and should be confirmed against the
    poster's actual file.
    """
    for epoch in range(num_epochs):
        train_l_sum = 0
        train_acc_sum = 0
        i = 0  # Batch counter kept from the original post (debugging aid).
        for X, y in train_iter:
            i += 1
            with autograd.record():
                y_hat = net(X)  # The code stops working when I included softmax function
                l = loss(y_hat, y)
            l.backward()
            if trainer is None:
                nb.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # This will be illustrated in the next section.
            train_l_sum += l.mean().asscalar()
            train_acc_sum += accuracy(y_hat, y)
        test_acc = evaluate_accuracy(test_iter, net)
        # Plain ASCII quotes: the typographic quotes in the pasted version are
        # not valid Python string delimiters.
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / len(train_iter),
                 train_acc_sum / len(train_iter), test_acc))
        # Presumably printed once per epoch to show whether the outer loop
        # advances; placement inferred, TODO confirm.
        print(epoch)

Working code

def test(net, train_iter, test_iter, loss, num_epochs, batch_size,
         params=None, lr=None, trainer=None):
    """Stripped-down loop used to check that epoch iteration works without softmax.

    Same signature as `train_ch3`, but the inner loop only counts batches
    (printing the counter every 50 batches) and records a trivial `exp()`
    under autograd; no loss, backward pass or parameter update is performed.

    NOTE(review): the indentation was lost when this snippet was pasted; the
    nesting below is reconstructed and should be confirmed against the
    poster's actual file.
    """
    for epoch in range(num_epochs):
        train_l_sum = 0
        train_acc_sum = 0
        i = 0
        for X, y in train_iter:
            if i % 50 == 0:
                print(i)
            i += 1
            with autograd.record():
                y_h = nd.array([2]).exp()
        # Presumably printed once per epoch to show that the outer loop
        # advances; placement inferred, TODO confirm.
        print(epoch)

Am I doing anything wrong in these programs?

Vinu


#2

I copied all the code from the original book into a single .py file, and it seems to work fine for me.

The only problem I experienced is related to multiprocessing in data loading: for some reason, if I run the same code several times, on one of the runs I receive the error message “Pipe is broken”, which doesn’t stop the training but looks ugly. So the only change I made to the original code is that I copied the data-loading code into my .py file in order to set num_workers to 0.

Try this code out.

import sys
import gluonbook as gb
import os
from mxnet import autograd, nd
from mxnet.gluon import data
# Mini-batch size; passed to load_data_fashion_mnist and train_ch3 below.
batch_size = 256


def load_data_fashion_mnist(batch_size, resize=None, root=os.path.join(
        '~', '.mxnet', 'datasets', 'fashion-mnist'), num_workers=0):
    """Download the Fashion-MNIST dataset and return train/test data loaders.

    Args:
        batch_size: Number of examples per mini-batch, used for both loaders.
        resize: Optional size handed to `transforms.Resize`, applied before
            `ToTensor`; no resizing is done when this is falsy.
        root: Directory for the dataset files; a leading '~' is expanded.
        num_workers: Worker count forwarded to both `DataLoader`s. Defaults
            to 0, the value this script previously hard-coded to avoid the
            "Pipe is broken" messages seen with multi-process loading.

    Returns:
        A `(train_iter, test_iter)` tuple of `DataLoader`s; only the training
        loader shuffles its data.
    """
    root = os.path.expanduser(root)
    transforms = data.vision.transforms

    # Resize (if requested) must run before the conversion to a tensor.
    steps = [transforms.Resize(resize)] if resize else []
    steps.append(transforms.ToTensor())
    transformer = transforms.Compose(steps)

    mnist_train = data.vision.FashionMNIST(root=root, train=True)
    mnist_test = data.vision.FashionMNIST(root=root, train=False)

    train_iter = data.DataLoader(mnist_train.transform_first(transformer),
                                 batch_size, shuffle=True,
                                 num_workers=num_workers)
    test_iter = data.DataLoader(mnist_test.transform_first(transformer),
                                batch_size, shuffle=False,
                                num_workers=num_workers)
    return train_iter, test_iter


# Build the training and test loaders with the mini-batch size chosen above.
train_iter, test_iter = load_data_fashion_mnist(batch_size)

# Sizes of the linear layer: inputs per example and number of output scores.
num_inputs, num_outputs = 784, 10

# Model parameters: small random weights and a zero bias.
W = nd.random.normal(shape=(num_inputs, num_outputs), scale=0.01)
b = nd.zeros(num_outputs)

# Attach gradient buffers to both parameters.
W.attach_grad()
b.attach_grad()

# Small demo of summing along each axis while keeping the reduced dimension.
X = nd.array([[1, 2, 3], [4, 5, 6]])
# Notebook-style display expression: evaluated and discarded in a script.
X.sum(axis=0, keepdims=True), X.sum(axis=1, keepdims=True)


def softmax(X):
    """Return the softmax of `X` taken along axis 1.

    Each entry is exponentiated and divided by the sum of the exponentials
    along axis 1, so the values along that axis sum to 1.

    The maximum along axis 1 is subtracted before exponentiating. This leaves
    the result mathematically unchanged (the common factor cancels in the
    ratio) but keeps `exp()` from overflowing to inf, which would otherwise
    turn the division into inf / inf = nan for large inputs.
    """
    shifted = X - X.max(axis=1, keepdims=True)
    X_exp = shifted.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition  # The broadcast mechanism is applied here.


# Try softmax on a random 2x5 input.
X = nd.random.normal(shape=(2, 5))
X_prob = softmax(X)
# Notebook-style display expression: evaluated and discarded in a script.
X_prob, X_prob.sum(axis=1)


def net(X):
    """Apply the softmax-regression model to a batch `X`.

    `X` is reshaped to rows of `num_inputs` values, multiplied by the
    module-level weights `W`, offset by the bias `b`, and passed through
    `softmax`.
    """
    flattened = X.reshape((-1, num_inputs))
    scores = nd.dot(flattened, W) + b
    return softmax(scores)


# Toy predictions (two rows, three columns) and labels for the demos below.
y_hat = nd.array([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = nd.array([0, 2])
# Notebook-style display expression: evaluated and discarded in a script.
nd.pick(y_hat, y)


def cross_entropy(y_hat, y):
    """Return the negative log of the entries of `y_hat` selected by `y`.

    `nd.pick(y_hat, y)` picks one entry of `y_hat` per index in `y`; the
    result is the element-wise negated logarithm of those entries.
    """
    picked = nd.pick(y_hat, y)
    return -picked.log()


def accuracy(y_hat, y):
    """Return the fraction of rows whose arg-max along axis 1 equals the label.

    Labels are cast to float32 before the comparison; the mean of the
    element-wise equality is returned as a Python scalar.
    """
    predicted = y_hat.argmax(axis=1)
    labels = y.astype('float32')
    matches = predicted == labels
    return matches.mean().asscalar()


# Demo call on the module-level y_hat / y; the result is discarded in a script.
accuracy(y_hat, y)


# The function will be gradually improved: the complete implementation will be
# discussed in the "Image Augmentation" section.
def evaluate_accuracy(data_iter, net):
    """Return the mean of the per-batch accuracies of `net` over `data_iter`.

    Every batch contributes equally to the average, whatever its size.
    """
    total = sum(accuracy(net(X), y) for X, y in data_iter)
    return total / len(data_iter)


# Test accuracy before any training has run; the result is discarded in a script.
evaluate_accuracy(test_iter, net)

# Number of training epochs and learning rate, passed to train_ch3 below.
num_epochs, lr = 5, 0.1


def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    """Train `net` for `num_epochs` passes over `train_iter`, printing stats per epoch.

    For each mini-batch the forward pass and loss are recorded by autograd
    and back-propagated. The parameters are then updated with
    `trainer.step(batch_size)` when a trainer is supplied, otherwise with
    `gb.sgd(params, lr, batch_size)`.

    After every epoch one line is printed with the mean of the per-batch
    losses, the mean of the per-batch training accuracies, and the test
    accuracy from `evaluate_accuracy`. Nothing is returned.
    """
    for epoch in range(num_epochs):
        loss_total, acc_total = 0, 0
        for X, y in train_iter:
            with autograd.record():
                y_hat = net(X)
                batch_loss = loss(y_hat, y)
            batch_loss.backward()
            if trainer is not None:
                trainer.step(batch_size)  # This will be illustrated in the next section.
            else:
                gb.sgd(params, lr, batch_size)
            loss_total += batch_loss.mean().asscalar()
            acc_total += accuracy(y_hat, y)
        test_acc = evaluate_accuracy(test_iter, net)
        num_batches = len(train_iter)
        report = (epoch + 1, loss_total / num_batches,
                  acc_total / num_batches, test_acc)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f' % report)


# Train with the hand-written cross-entropy loss and gb.sgd updates of W and b.
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size,
          params=[W, b], lr=lr)


# Take only the first batch of the test set.
for X, y in test_iter:
    break

true_labels = gb.get_fashion_mnist_labels(y.asnumpy())
pred_labels = gb.get_fashion_mnist_labels(
    net(X).argmax(axis=1).asnumpy())
# One title per example: true label on the first line, prediction on the second.
titles = [actual + '\n' + predicted
          for actual, predicted in zip(true_labels, pred_labels)]

# Show the first nine examples with their titles.
gb.show_fashion_mnist(X[:9], titles[:9])

If it still doesn’t work for you, check your version of MXNet - I use 1.4.0.


#3

That is great, it works now. Copying load_data_fashion_mnist to the script does the job. Thanks @Sergey.


#4

When I upgraded MXNet from v1.2.1 to v1.3.1, load_data_fashion_mnist() no longer needed to be included in the script. It works well as it is given in the book.