Softmax from scratch doesn't work

#1

Hi, in the section “Softmax Regression from Scratch” I tried to run the code as given in the book. However, the first (outer) for loop doesn’t iterate over the epochs; it just stops after one epoch. I tested the code and found that this problem happens only when the softmax function is used in the second (inner) for loop. I have checked a few other functions inside the second for loop and all of them worked. My code is given below.

Issue code:

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    """Train ``net`` for ``num_epochs`` epochs, printing metrics after each one.

    Args:
        net: Callable mapping a batch ``X`` to predictions ``y_hat``.
        train_iter: Iterable of ``(X, y)`` training mini-batches; must
            support ``len()``.
        test_iter: Iterable of ``(X, y)`` batches passed to
            ``evaluate_accuracy``.
        loss: Callable ``loss(y_hat, y)`` returning the per-batch loss.
        num_epochs: Number of passes over ``train_iter``.
        batch_size: Forwarded to the parameter update step.
        params: Parameters handed to ``nb.sgd`` when ``trainer`` is None.
        lr: Learning rate handed to ``nb.sgd`` when ``trainer`` is None.
        trainer: Optional object with a ``step(batch_size)`` method; when
            given it is used instead of ``nb.sgd``.
    """
    # Imported locally so this snippet does not depend on a file-level import.
    from mxnet import autograd

    for epoch in range(num_epochs):
        train_l_sum = 0
        train_acc_sum = 0
        i = 0  # Debug counter: mini-batches processed in this epoch.
        for X, y in train_iter:
            i += 1
            # The forward pass and the loss must be computed inside
            # autograd.record(); otherwise l.backward() has no recorded
            # graph to differentiate.
            with autograd.record():
                y_hat = net(X)  # The code stops working when I included softmax function
                l = loss(y_hat, y)
            l.backward()
            if trainer is None:
                # NOTE(review): `nb` is presumably the poster's alias for the
                # book's helper module (the reply below uses `gb`) - confirm.
                nb.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # This will be illustrated in the next section.
            train_l_sum += l.mean().asscalar()
            train_acc_sum += accuracy(y_hat, y)
        test_acc = evaluate_accuracy(test_iter, net)
        # Both sums are averaged over the number of batches, not examples.
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / len(train_iter),
                 train_acc_sum / len(train_iter), test_acc))
        print(epoch)

Working code

def test(net, train_iter, test_iter, loss, num_epochs, batch_size,
         params=None, lr=None, trainer=None):
    """Stripped-down training loop used to check that epoch iteration works.

    Walks over ``train_iter`` once per epoch, printing the batch index every
    50 batches and the epoch index after each pass. No model, loss, or
    parameter update is involved; only a throw-away ``exp`` is evaluated per
    batch. The signature mirrors ``train_ch3`` so the two can be swapped.
    """
    for epoch in range(num_epochs):
        train_l_sum = train_acc_sum = 0
        for i, (X, y) in enumerate(train_iter):
            if i % 50 == 0:
                print(i)
            # Dummy NDArray computation standing in for the real forward pass.
            y_h = nd.array([2]).exp()
        print(epoch)

Am I doing anything wrong in these programs?

Vinu

#2

I copied all code from original book into a single `.py` file, and it seems to work fine to me.

The only problem I experienced is related to multiprocessing in data loading: for some reason, if I run the same code several times, on some runs I receive the error message `Pipe is broken`, which doesn’t stop the training but looks ugly. So the only change I made to the original code was to copy the data-loading code into my `.py` file so that I could set `num_workers` to 0.

Try this code out.

``````import sys
import gluonbook as gb
import os
from mxnet.gluon import data
batch_size = 256

'~', '.mxnet', 'datasets', 'fashion-mnist')):
root = os.path.expanduser(root)
transformer = []
if resize:
transformer += [data.vision.transforms.Resize(resize)]
transformer += [data.vision.transforms.ToTensor()]
transformer = data.vision.transforms.Compose(transformer)

mnist_train = data.vision.FashionMNIST(root=root, train=True)
mnist_test = data.vision.FashionMNIST(root=root, train=False)
num_workers = 0

batch_size, shuffle=True,
num_workers=num_workers)
batch_size, shuffle=False,
num_workers=num_workers)
return train_iter, test_iter

# Size of each flattened input example (net() reshapes X to (-1, num_inputs))
# and number of outputs produced by the linear layer.
num_inputs = 784
num_outputs = 10

# Model parameters: weights drawn from a normal distribution with scale 0.01,
# biases initialised to zero.
# NOTE(review): no attach_grad() calls for W and b are visible in this paste;
# gradient-based training needs them - confirm against the original script.
W = nd.random.normal(scale=0.01, shape=(num_inputs, num_outputs))
b = nd.zeros(num_outputs)

# Demo: summing a 2x3 array along axis 0 and along axis 1 while keeping the
# reduced axis (keepdims=True), as softmax() does below.
X = nd.array([[1, 2, 3], [4, 5, 6]])
X.sum(axis=0, keepdims=True), X.sum(axis=1, keepdims=True)

def softmax(X):
    """Return the softmax of ``X`` taken along axis 1.

    Each row is exponentiated and divided by its own sum, so every row of
    the result is non-negative and sums to 1.

    Args:
        X: NDArray with at least two axes; normalisation runs over axis 1.

    Returns:
        NDArray of the same shape as ``X``.
    """
    # Softmax is unchanged by subtracting a per-row constant. Shifting by the
    # row maximum keeps every exponent <= 0, so exp() cannot overflow to inf
    # (which would otherwise turn the division below into nan).
    shifted = X - X.max(axis=1, keepdims=True)
    X_exp = shifted.exp()
    partition = X_exp.sum(axis=1, keepdims=True)
    return X_exp / partition  # The broadcast mechanism is applied here.

# Sanity check on random input: softmax divides each row by its own sum, so
# the row sums of X_prob should all be 1.
X = nd.random.normal(shape=(2, 5))
X_prob = softmax(X)
X_prob, X_prob.sum(axis=1)

def net(X):
    """Softmax regression model: flatten, apply the linear layer, then softmax.

    Uses the module-level parameters ``W`` and ``b`` and the input width
    ``num_inputs``.
    """
    flat = X.reshape((-1, num_inputs))  # one row per example
    logits = nd.dot(flat, W) + b
    return softmax(logits)

# Demo of nd.pick: for each row of y_hat, select the entry at the index given
# by the matching element of y (here 0.1 from row 0 and 0.5 from row 1).
y_hat = nd.array([[0.1, 0.3, 0.6], [0.3, 0.2, 0.5]])
y = nd.array([0, 2])
nd.pick(y_hat, y)

def cross_entropy(y_hat, y):
    """Return the negative log of the value ``nd.pick`` selects per example.

    Args:
        y_hat: Predictions passed to ``nd.pick`` (one row per example).
        y: Indices passed to ``nd.pick`` (the label of each example).
    """
    picked = nd.pick(y_hat, y)
    return -picked.log()

def accuracy(y_hat, y):
    """Return the fraction of rows whose arg-max along axis 1 equals the label.

    ``y`` is cast to float32 before the comparison, and the result is
    returned as a Python scalar via ``asscalar()``.
    """
    predicted = y_hat.argmax(axis=1)
    matches = predicted == y.astype('float32')
    return matches.mean().asscalar()

# Toy check with the y_hat / y defined above: both rows peak at index 2 while
# the labels are [0, 2], so one of the two predictions matches.
accuracy(y_hat, y)

# The function will be gradually improved: the complete implementation will be
# discussed in the "Image Augmentation" section.
def evaluate_accuracy(data_iter, net):
    """Return the mean per-batch accuracy of ``net`` over ``data_iter``.

    Every batch contributes equally to the average, whatever its size, since
    the per-batch accuracies are summed and divided by ``len(data_iter)``.
    """
    batch_scores = (accuracy(net(X), y) for X, y in data_iter)
    return sum(batch_scores) / len(data_iter)

# Accuracy of the model before any training (W is still randomly initialised).
# NOTE(review): the statement that assigns train_iter / test_iter is not
# visible in this paste - confirm against the original script.
evaluate_accuracy(test_iter, net)

# Training hyperparameters: number of epochs and learning rate.
num_epochs, lr = 5, 0.1

def train_ch3(net, train_iter, test_iter, loss, num_epochs, batch_size,
              params=None, lr=None, trainer=None):
    """Train ``net`` for ``num_epochs`` epochs, printing metrics after each one.

    Args:
        net: Callable mapping a batch ``X`` to predictions ``y_hat``.
        train_iter: Iterable of ``(X, y)`` training mini-batches; must
            support ``len()``.
        test_iter: Iterable of ``(X, y)`` batches passed to
            ``evaluate_accuracy``.
        loss: Callable ``loss(y_hat, y)`` returning the per-batch loss.
        num_epochs: Number of passes over ``train_iter``.
        batch_size: Forwarded to the parameter update step.
        params: Parameters handed to ``gb.sgd`` when ``trainer`` is None.
        lr: Learning rate handed to ``gb.sgd`` when ``trainer`` is None.
        trainer: Optional object with a ``step(batch_size)`` method; when
            given it is used instead of ``gb.sgd``.
    """
    # Imported locally so this snippet does not depend on a file-level import.
    from mxnet import autograd

    for epoch in range(num_epochs):
        train_l_sum = 0
        train_acc_sum = 0
        for X, y in train_iter:
            # The forward pass and the loss must be computed inside
            # autograd.record(); otherwise l.backward() has no recorded
            # graph to differentiate.
            with autograd.record():
                y_hat = net(X)
                l = loss(y_hat, y)
            l.backward()
            if trainer is None:
                gb.sgd(params, lr, batch_size)
            else:
                trainer.step(batch_size)  # This will be illustrated in the next section.
            train_l_sum += l.mean().asscalar()
            train_acc_sum += accuracy(y_hat, y)
        test_acc = evaluate_accuracy(test_iter, net)
        # Both sums are averaged over the number of batches, not examples.
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f'
              % (epoch + 1, train_l_sum / len(train_iter),
                 train_acc_sum / len(train_iter), test_acc))

# Train the from-scratch model with plain SGD on the parameters W and b.
train_ch3(net, train_iter, test_iter, cross_entropy, num_epochs, batch_size, [W, b], lr)

# Grab only the first batch of the test set.
for X, y in test_iter:
    break

# Build one "true label / predicted label" caption per example in the batch.
true_labels = gb.get_fashion_mnist_labels(y.asnumpy())
pred_labels = gb.get_fashion_mnist_labels(net(X).argmax(axis=1).asnumpy())
titles = [truelabel + '\n' + predlabel for truelabel, predlabel in zip(true_labels, pred_labels)]

# Show the first nine examples with their captions.
gb.show_fashion_mnist(X[0:9], titles[0:9])
``````

If it still doesn’t work for you, check your version of MXNet - I use 1.4.0.

#3

That is great, it works now. Copying load_data_fashion_mnist to the script does the job. Thanks @Sergey.

#4

When I upgraded MXNet from v1.2.1 to v1.3.1, load_data_fashion_mnist() no longer needed to be included in the script. It works well as given in the book.