Getting a ValueError while fitting the LSTM model


#1

I am new to MXNet and working on a handwritten word recognition task. I have data as a numpy array of images of size (150, 150), and labels as a sparse matrix obtained from one-hot encoding of the label list (the list of texts in the images). My model consists of 4 conv layers followed by a bi-LSTM layer and a CTC loss function. I am getting an error while fitting the model. The error is as follows:

"ValueError: Shape of labels 1 does not match shape of predictions 2"

I think the error I am getting comes from the evaluation metric, but I am unable to find a solution. Can anyone please help me with this issue?

My script is:

import mxnet as mx
batch_size = 50
# Wrap the numpy image arrays and one-hot label matrices in MXNet data
# iterators.  The labels are sparse matrices, hence the .toarray() calls.
# NOTE(review): assumes X_train/X_valid/X_test and y_* were built earlier
# (not shown in this snippet) -- shapes per the comments below.
train_iter = mx.io.NDArrayIter(X_train, y_train.toarray(), batch_size, shuffle=True)
val_iter = mx.io.NDArrayIter(X_valid, y_valid.toarray(), batch_size)
test_iter = mx.io.NDArrayIter(X_test, y_test.toarray(), batch_size)

#Training set (69190, 1, 150, 150) (69190, 13550)
#Valid set (23064, 1, 150, 150) (23064, 13550)
#Test set (23064, 1, 150, 150) (23064, 13550)

import mxnet as mx
import numpy as np
import time
import math

def cnn_lstm_ctc(seq_len, num_hidden, num_classes, dropout=0.):
    """Build the CNN + bidirectional-LSTM + CTC training symbol.

    Parameters
    ----------
    seq_len : int
        Number of timesteps the CNN feature map is split into for the RNN.
    num_hidden : int
        Hidden size of each LSTM direction.
    num_classes : int
        Size of the output alphabet (width of the one-hot labels).
    dropout : float
        Dropout probability applied after conv blocks 2 and 4.

    Returns
    -------
    mx.sym.Symbol
        Grouped symbol: [gradient-blocked softmax (for metrics), CTC loss].

    Notes
    -----
    Reads the module-level global ``batch_size`` (as the original did).
    """
    # Inputs.
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('softmax_label')
    data = mx.sym.L2Normalization(data)

    # --- CNN feature extractor ------------------------------------------
    # Block 1: conv -> relu.
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=16)
    relu1 = mx.sym.Activation(data=conv1, act_type="relu")
    # Block 2: conv -> relu -> pool -> L2 norm -> dropout.
    conv2 = mx.sym.Convolution(data=relu1, kernel=(5, 5), num_filter=32)
    relu2 = mx.sym.Activation(data=conv2, act_type="relu")
    pool2 = mx.sym.Pooling(data=relu2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    l2_norm2 = mx.sym.L2Normalization(pool2)
    d_out2 = mx.sym.Dropout(data=l2_norm2, p=dropout)
    # Block 3: conv -> relu -> pool.
    conv3 = mx.sym.Convolution(data=d_out2, kernel=(5, 5), num_filter=64)
    relu3 = mx.sym.Activation(data=conv3, act_type="relu")
    pool3 = mx.sym.Pooling(data=relu3, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # Block 4: conv -> relu -> pool -> L2 norm -> dropout.
    conv4 = mx.sym.Convolution(data=pool3, kernel=(5, 5), num_filter=128)
    relu4 = mx.sym.Activation(data=conv4, act_type="relu")
    pool4 = mx.sym.Pooling(data=relu4, pool_type="max", kernel=(2, 2), stride=(2, 2))
    l2_norm4 = mx.sym.L2Normalization(pool4)
    d_out4 = mx.sym.Dropout(data=l2_norm4, p=dropout)

    # Debug: report the CNN output shape for a (50, 1, 150, 150) input.
    _, output_shape, _ = d_out4.infer_shape(data=(50, 1, 150, 150))
    print("cnn {}".format(output_shape))

    # Move the width axis next to batch so the flattened features can be
    # split column-wise into `seq_len` per-timestep slices for the RNN.
    cnn_out = mx.sym.transpose(data=d_out4, axes=(0, 3, 1, 2), name="cnn_out")
    flatten_out = mx.sym.Flatten(data=cnn_out, name="flatten_out")
    wordvec = mx.sym.split(data=flatten_out, num_outputs=seq_len, axis=1)

    # --- Bidirectional LSTM ---------------------------------------------
    # Fixed: the original hard-coded 200 here, silently ignoring the
    # `num_hidden` parameter (and 400 = 2*200 in the reshape below).
    stack = mx.rnn.SequentialRNNCell()
    fw_lstm = mx.rnn.LSTMCell(num_hidden=num_hidden, forget_bias=1.0, prefix='lstm_fw_')
    bw_lstm = mx.rnn.LSTMCell(num_hidden=num_hidden, forget_bias=1.0, prefix='lstm_bw_')
    stack.add(mx.rnn.BidirectionalCell(fw_lstm, bw_lstm))
    outputs, states = stack.unroll(length=seq_len, inputs=list(wordvec), merge_outputs=True)

    # (batch, seq_len, 2*num_hidden) -> (batch*seq_len, 2*num_hidden),
    # then project each timestep onto the alphabet.
    outputs = mx.sym.Reshape(data=outputs, shape=(-1, 2 * num_hidden))
    pred = mx.sym.FullyConnected(data=outputs, num_hidden=num_classes)

    # --- CTC loss --------------------------------------------------------
    # mx.sym.contrib.ctc_loss expects data shaped
    # (sequence_length, batch_size, alphabet_size).  `pred` rows are
    # batch-major (row = b * seq_len + t), so restore (batch, seq, classes)
    # and swap the first two axes.  NOTE(review): the original reshape
    # shape=(num_classes, batch_size, -1) scrambles this layout and is a
    # likely source of the shape errors -- confirm against the CTC docs.
    pred_ctc = mx.sym.Reshape(data=pred, shape=(batch_size, seq_len, num_classes))
    pred_ctc = mx.sym.transpose(data=pred_ctc, axes=(1, 0, 2))

    loss = mx.sym.contrib.ctc_loss(data=pred_ctc, label=label)
    ctc_loss = mx.sym.MakeLoss(loss)

    # Per-timestep softmax for evaluation only: gradients are blocked so
    # training is driven purely by the CTC loss.
    softmax_class = mx.symbol.SoftmaxActivation(data=pred, name='softmax')
    softmax_loss = mx.sym.BlockGrad(mx.sym.MakeLoss(softmax_class))

    return mx.sym.Group([softmax_loss, ctc_loss])

def ctc_label(p):
    """Collapse a raw per-timestep CTC path into the decoded label.

    Standard CTC best-path collapsing: a symbol is emitted only when it
    is non-blank (class 0) and differs from the previous timestep's
    symbol, so runs of repeats and blanks disappear.

    Parameters
    ----------
    p : list of int
        Per-timestep argmax predictions; 0 is the CTC blank.

    Returns
    -------
    list of int
        The decoded label sequence.
    """
    # Fixed: the pasted code was garbled -- `def ctc_label§:` and
    # `range(len§)` reconstructed as `(p)`.
    ret = []
    # Prepend a blank so the first real symbol always differs from its
    # predecessor and is emitted.
    p1 = [0] + p
    for i in range(len(p)):
        c1 = p1[i]
        c2 = p1[i + 1]
        # Skip blanks and repeats of the previous timestep.
        if c2 == 0 or c2 == c1:
            continue
        ret.append(c2)
    return ret

def remove_blank(l):
    """Return the prefix of *l* up to (excluding) the first blank (0).

    Labels are zero-padded, so everything from the first 0 onward is
    padding and is dropped.
    """
    prefix = []
    idx = 0
    n = len(l)
    while idx < n and l[idx] != 0:
        prefix.append(l[idx])
        idx += 1
    return prefix

def Accuracy(label, pred):
    """Sequence-level exact-match accuracy for the CTC model.

    Parameters
    ----------
    label : ndarray
        Batch of zero-padded label sequences, one row per sample.
    pred : ndarray
        Per-timestep class scores.  Row ``k * batch_size + i`` is assumed
        to be timestep ``k`` of sample ``i`` (time-major) -- TODO confirm
        this matches the network's output layout.

    Returns
    -------
    float
        Fraction of samples whose greedy CTC decode exactly matches the
        blank-stripped label.
    """
    global batch_size
    # Fixed: the original read the undefined global `seq_length`; only
    # `seq_len` is ever defined in this script.
    global seq_len
    hit = 0.
    total = 0.
    for i in range(batch_size):
        l = remove_blank(label[i])
        # Greedy (best-path) decode: argmax per timestep, then collapse
        # repeats and blanks.
        p = []
        for k in range(seq_len):
            p.append(np.argmax(pred[k * batch_size + i]))
        # Fixed: garbled `ctc_label§` / `len§` reconstructed as `(p)`.
        p = ctc_label(p)
        if len(p) == len(l):
            match = True
            for k in range(len(p)):
                if p[k] != int(l[k]):
                    match = False
                    break
            if match:
                hit += 1.0
        total += 1.0
    return hit / total

import logging
import os  # fixed: os.path.join is used below but `os` was never imported

# Log to stderr and to a fresh file (truncate any previous run's log).
log_file_name = "CNN_LSTM_CNN_MXNet.log"
log_file = open(log_file_name, 'w')
log_file.close()
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fh = logging.FileHandler(log_file_name)
logger.addHandler(fh)
# Checkpoint prefix: ./model/lstm_ctc-<epoch>.params
prefix = os.path.join(os.getcwd(), 'model', 'lstm_ctc')

# Model hyper-parameters.
seq_len = 32
seq_length = seq_len  # fixed: Accuracy() reads the global name `seq_length`
num_hidden = 200
num_classes = 13550
num_lstm_layer = 1
batch_size = 50

sym = cnn_lstm_ctc(seq_len, num_hidden=num_hidden, num_classes=num_classes, dropout=0)
model = mx.mod.Module(symbol=sym, context=mx.gpu(0))

head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
logger.info('begin fit')

# NOTE(review): mx.metric.np wraps a (label, pred) numpy function, and the
# reported "Shape of labels ... does not match shape of predictions ..."
# ValueError is raised by the built-in metric's shape check; a custom
# mx.metric.CustomMetric / EvalMetric subclass may be needed -- confirm.
model.fit(train_data=train_iter,
          eval_data=val_iter,
          eval_metric=mx.metric.np(Accuracy),
          num_epoch=10,
          optimizer='Adam',
          optimizer_params={'learning_rate': 0.01},
          initializer=mx.init.Xavier(factor_type="in", magnitude=2.34),
          epoch_end_callback=mx.callback.do_checkpoint(prefix),
          batch_end_callback=mx.callback.Speedometer(batch_size, 10))

Thanks in advance
Harathi