Getting error while fitting an LSTM model


#1

I am new to MXNet. I am getting an error while fitting the model. The model consists of 4 conv layers followed by a bi-LSTM layer and a CTC loss function. The error is as follows:

"MXNetError: [19:13:07] src/executor/graph_executor.cc:461: InferShape pass cannot decide shapes for the following arguments (0s means unknown dimensions). Please consider providing them as inputs:
l0_init_h: [], l1_init_h: []"

l0_init_h, l1_init_h are hidden states which I defined here:

num_lstm_layer = 1
for i in range(num_lstm_layer*2):
    last_states.append(LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
                                 h=mx.sym.Variable("l%d_init_h" % i)))

I am giving them as input to a fully connected layer as follows, where prev_state.h will be 'l0_init_h':
h2h = mx.sym.FullyConnected(data=prev_state.h,………………)

Even though I am giving the hidden state as input to the fully connected layer, I am still getting the error "Please consider providing them as inputs".

Could anyone please help me resolve this issue? Thanks in advance.


#2

Hi @harathi, thanks for posting here. Would you mind posting the entire script so I can try to reproduce the issue? What version of MXNet are you using?


#3

Hi @ThomasDelteil, thank you for the reply. I am using MXNet version 0.12.1.
Here is the code:

import mxnet as mx
import numpy as np
from collections import namedtuple
import time
import math
# Lightweight containers for one LSTM cell's state, its parameters, and the
# fully-built model. (The original paste used curly quotes, which are a
# Python SyntaxError; restored to straight quotes.)
LSTMState = namedtuple("LSTMState", ["c", "h"])
LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias",
                                     "h2h_weight", "h2h_bias"])
LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol",
                                     "init_states", "last_states",
                                     "forward_state", "backward_state",
                                     "seq_data", "seq_labels", "seq_outputs",
                                     "param_blocks"])

def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.):
    """Build one unrolled LSTM cell step as an MXNet symbol.

    Parameters
    ----------
    num_hidden : int        hidden size of the cell
    indata : Symbol         input at this time step
    prev_state : LSTMState  (c, h) from the previous step
    param : LSTMParam       shared i2h/h2h weights and biases
    seqidx, layeridx : int  used only to build unique operator names
    dropout : float         dropout probability applied to the input

    Returns
    -------
    LSTMState with the next cell state (c) and hidden output (h).

    (Restored from the forum paste: straight quotes and 4-space indentation.)
    """
    if dropout > 0.:
        indata = mx.sym.Dropout(data=indata, p=dropout)
    # Fused projections: 4*num_hidden outputs, one slice per gate.
    i2h = mx.sym.FullyConnected(data=indata,
                                weight=param.i2h_weight,
                                bias=param.i2h_bias,
                                num_hidden=num_hidden * 4,
                                name="t%d_l%d_i2h" % (seqidx, layeridx))
    h2h = mx.sym.FullyConnected(data=prev_state.h,
                                weight=param.h2h_weight,
                                bias=param.h2h_bias,
                                num_hidden=num_hidden * 4,
                                name="t%d_l%d_h2h" % (seqidx, layeridx))
    gates = i2h + h2h
    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4,
                                      name="t%d_l%d_slice" % (seqidx, layeridx))
    in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid")
    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
    forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid")
    out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid")
    # Standard LSTM update equations.
    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")
    return LSTMState(c=next_c, h=next_h)

def cnn_lstm_ctc(num_lstm_layer, seq_len, num_hidden, num_classes, num_label, dropout=0.):
    """Build a 4-conv-layer CNN -> bidirectional LSTM -> CTC loss symbol.

    Parameters
    ----------
    num_lstm_layer : int  number of bidirectional LSTM layers
    seq_len : int         number of time steps the LSTM is unrolled for
    num_hidden : int      hidden size of each LSTM cell
    num_classes : int     CTC output alphabet size
    num_label : int       max label length (only used by the commented WarpCTC path)
    dropout : float       dropout probability used throughout the network

    Returns a grouped symbol of (gradient-blocked softmax output, CTC loss).

    NOTE(review): the "l%d_init_c"/"l%d_init_h" symbols below are plain
    mx.sym.Variable inputs with no shape attached anywhere in this graph --
    that is exactly why InferShape reports "l0_init_h: [], l1_init_h: []".
    Their (batch_size, num_hidden) shapes must be supplied as extra named
    inputs by the data iterator when the model is bound/fit.
    """
    last_states = []
    forward_param = []
    backward_param = []
    # Even indices hold the forward-direction state/params, odd the backward.
    for i in range(num_lstm_layer * 2):
        last_states.append(LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
                                     h=mx.sym.Variable("l%d_init_h" % i)))
        if i % 2 == 0:
            forward_param.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
                                           i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
                                           h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
                                           h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
        else:
            backward_param.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
                                            i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
                                            h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
                                            h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))

    # Network inputs; 'softmax_label' carries the CTC target sequence.
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('softmax_label')
    data = mx.sym.L2Normalization(data)

    # --- CNN feature extractor: four conv blocks ---
    # first conv layer
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=16)
    relu1 = mx.sym.Activation(data=conv1, act_type="relu")
    # second conv layer
    conv2 = mx.sym.Convolution(data=relu1, kernel=(5, 5), num_filter=32)
    relu2 = mx.sym.Activation(data=conv2, act_type="relu")
    pool2 = mx.sym.Pooling(data=relu2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    l2_norm2 = mx.sym.L2Normalization(pool2)
    d_out2 = mx.sym.Dropout(data=l2_norm2, p=dropout)
    # third conv layer
    conv3 = mx.sym.Convolution(data=d_out2, kernel=(5, 5), num_filter=64)
    relu3 = mx.sym.Activation(data=conv3, act_type="relu")
    pool3 = mx.sym.Pooling(data=relu3, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # fourth conv layer
    conv4 = mx.sym.Convolution(data=pool3, kernel=(5, 5), num_filter=128)
    relu4 = mx.sym.Activation(data=conv4, act_type="relu")
    pool4 = mx.sym.Pooling(data=relu4, pool_type="max", kernel=(2, 2), stride=(2, 2))
    l2_norm4 = mx.sym.L2Normalization(pool4)
    d_out4 = mx.sym.Dropout(data=l2_norm4, p=dropout)

    # Shape-debugging prints; the (128, 1, 150, 150) input shape is hard-coded
    # here and does not match batch_size used later -- TODO confirm intended.
    arg_shape, output_shape, aux_shape = d_out4.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)

    # Reorder NCHW -> (batch, width, channel, height), flatten, then split
    # into seq_len per-time-step slices feeding the LSTM.
    cnn_out = mx.sym.transpose(data=d_out4, axes=(0, 3, 1, 2), name="cnn_out")
    arg_shape, output_shape, aux_shape = cnn_out.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)
    flatten_out = mx.sym.Flatten(data=cnn_out, name="flatten_out")
    arg_shape, output_shape, aux_shape = flatten_out.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)
    wordvec = mx.sym.split(data=flatten_out, num_outputs=seq_len, axis=1)
    arg_shape, output_shape, aux_shape = wordvec.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)

    # Forward-direction unroll through time.
    forward_hidden = []
    for seqidx in range(seq_len):
        hidden = wordvec[seqidx]
        for i in range(num_lstm_layer):
            next_state = lstm(num_hidden, indata=hidden,
                              prev_state=last_states[2 * i],
                              param=forward_param[i],
                              seqidx=seqidx, layeridx=0, dropout=dropout)
            hidden = next_state.h
            last_states[2 * i] = next_state
        forward_hidden.append(hidden)

    # Backward-direction unroll (walks the sequence in reverse; insert(0, ...)
    # restores chronological order).
    backward_hidden = []
    for seqidx in range(seq_len):
        k = seq_len - seqidx - 1
        hidden = wordvec[k]
        for i in range(num_lstm_layer):
            next_state = lstm(num_hidden, indata=hidden,
                              prev_state=last_states[2 * i + 1],
                              param=backward_param[i],
                              seqidx=k, layeridx=1, dropout=dropout)
            hidden = next_state.h
            last_states[2 * i + 1] = next_state
        backward_hidden.insert(0, hidden)

    # Concatenate forward/backward features per step, then stack all steps
    # along axis 0 so one shared FullyConnected projects every step at once.
    hidden_all = []
    for i in range(seq_len):
        hidden_all.append(mx.sym.Concat(*[forward_hidden[i], backward_hidden[i]], dim=1))

    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
    pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=num_classes)

    # WarpCTC alternative, kept from the original post:
    #label = mx.sym.Reshape(data=label, shape=(-1,))
    #label = mx.sym.Cast(data=label, dtype='int32')
    #sm = mx.sym.WarpCTC(data=pred, label=label, label_length=num_label, input_length=seq_len)

    # MXNet CTC.
    # NOTE(review): batch_size is read from module scope, not a parameter --
    # TODO confirm. Also mx.sym.contrib.ctc_loss expects data laid out as
    # (seq_len, batch_size, alphabet_size); this Reshape's
    # (num_classes, batch_size, seq_len) ordering looks suspect -- verify.
    pred_ctc = mx.sym.Reshape(data=pred, shape=(num_classes, batch_size, seq_len))

    loss = mx.sym.contrib.ctc_loss(data=pred_ctc, label=label)
    ctc_loss = mx.sym.MakeLoss(loss)

    # Softmax output exposed for inference/metrics only; gradient is blocked
    # so training is driven solely by the CTC loss.
    softmax_class = mx.symbol.SoftmaxActivation(data=pred, name='label')
    softmax_loss = mx.sym.MakeLoss(softmax_class)
    softmax_loss = mx.sym.BlockGrad(softmax_loss)
    sm = mx.sym.Group([softmax_loss, ctc_loss])

    return sm

import logging
import os  # BUG FIX: os.path.join / os.getcwd are used below but os was never imported
#from mxnet_model import cnn_lstm_ctc

# Truncate any previous log file, then mirror logger output into it.
log_file_name = "CNN_LSTM_CNN_MXNet.log"
log_file = open(log_file_name, 'w')
log_file.close()
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fh = logging.FileHandler(log_file_name)
logger.addHandler(fh)
prefix = os.path.join(os.getcwd(), 'model', 'lstm_ctc')

# Hyperparameters (batch_size is also read inside cnn_lstm_ctc via module scope).
seq_len = 32
num_label = 15
num_hidden = 200
num_classes = 13550
num_lstm_layer = 1
batch_size = 50

sym = cnn_lstm_ctc(num_lstm_layer, seq_len, num_hidden=num_hidden,
                   num_classes=num_classes, num_label=num_label, dropout=0)

#net_model = mx.mod.Module(symbol=sym, context=mx.gpu())
model = mx.model.FeedForward(symbol=sym, context=[mx.gpu(0)], num_epoch=10,
                             optimizer='Adam',
                             learning_rate=0.01,
                             momentum=0.9,
                             initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))

head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
logger.info('begin fit')

# NOTE(review): train_iter, val_iter and Accuracy are not defined anywhere in
# this script -- presumably supplied elsewhere; verify. The iterators must
# also provide the l*_init_c / l*_init_h inputs with concrete shapes, or
# InferShape fails exactly as reported in the thread.
model.fit(X=train_iter,
          eval_data=val_iter,
          eval_metric=mx.metric.np(Accuracy),
          epoch_end_callback=mx.callback.do_checkpoint(prefix, period=1),
          batch_end_callback=mx.callback.Speedometer(batch_size, 50),
          logger=logger)


#4

Hi @ThomasDelteil, thank you for replying. I am using MXNet version 0.12.1. Here is the code:

pylint:skip-file

import mxnet as mx
import numpy as np
from collections import namedtuple
import time
import math
# Lightweight containers for one LSTM cell's state, its parameters, and the
# fully-built model. (The original paste used curly quotes, which are a
# Python SyntaxError; restored to straight quotes.)
LSTMState = namedtuple("LSTMState", ["c", "h"])
LSTMParam = namedtuple("LSTMParam", ["i2h_weight", "i2h_bias",
                                     "h2h_weight", "h2h_bias"])
LSTMModel = namedtuple("LSTMModel", ["rnn_exec", "symbol",
                                     "init_states", "last_states",
                                     "forward_state", "backward_state",
                                     "seq_data", "seq_labels", "seq_outputs",
                                     "param_blocks"])

def lstm(num_hidden, indata, prev_state, param, seqidx, layeridx, dropout=0.):
    """Build one unrolled LSTM cell step as an MXNet symbol.

    Parameters
    ----------
    num_hidden : int        hidden size of the cell
    indata : Symbol         input at this time step
    prev_state : LSTMState  (c, h) from the previous step
    param : LSTMParam       shared i2h/h2h weights and biases
    seqidx, layeridx : int  used only to build unique operator names
    dropout : float         dropout probability applied to the input

    Returns
    -------
    LSTMState with the next cell state (c) and hidden output (h).

    (Restored from the forum paste: straight quotes and 4-space indentation.)
    """
    if dropout > 0.:
        indata = mx.sym.Dropout(data=indata, p=dropout)
    # Fused projections: 4*num_hidden outputs, one slice per gate.
    i2h = mx.sym.FullyConnected(data=indata,
                                weight=param.i2h_weight,
                                bias=param.i2h_bias,
                                num_hidden=num_hidden * 4,
                                name="t%d_l%d_i2h" % (seqidx, layeridx))
    h2h = mx.sym.FullyConnected(data=prev_state.h,
                                weight=param.h2h_weight,
                                bias=param.h2h_bias,
                                num_hidden=num_hidden * 4,
                                name="t%d_l%d_h2h" % (seqidx, layeridx))
    gates = i2h + h2h
    slice_gates = mx.sym.SliceChannel(gates, num_outputs=4,
                                      name="t%d_l%d_slice" % (seqidx, layeridx))
    in_gate = mx.sym.Activation(slice_gates[0], act_type="sigmoid")
    in_transform = mx.sym.Activation(slice_gates[1], act_type="tanh")
    forget_gate = mx.sym.Activation(slice_gates[2], act_type="sigmoid")
    out_gate = mx.sym.Activation(slice_gates[3], act_type="sigmoid")
    # Standard LSTM update equations.
    next_c = (forget_gate * prev_state.c) + (in_gate * in_transform)
    next_h = out_gate * mx.sym.Activation(next_c, act_type="tanh")
    return LSTMState(c=next_c, h=next_h)

def cnn_lstm_ctc(num_lstm_layer, seq_len, num_hidden, num_classes, num_label, dropout=0.):
    """Build a 4-conv-layer CNN -> bidirectional LSTM -> CTC loss symbol.

    Parameters
    ----------
    num_lstm_layer : int  number of bidirectional LSTM layers
    seq_len : int         number of time steps the LSTM is unrolled for
    num_hidden : int      hidden size of each LSTM cell
    num_classes : int     CTC output alphabet size
    num_label : int       max label length (only used by the commented WarpCTC path)
    dropout : float       dropout probability used throughout the network

    Returns a grouped symbol of (gradient-blocked softmax output, CTC loss).

    NOTE(review): the "l%d_init_c"/"l%d_init_h" symbols below are plain
    mx.sym.Variable inputs with no shape attached anywhere in this graph --
    that is exactly why InferShape reports "l0_init_h: [], l1_init_h: []".
    Their (batch_size, num_hidden) shapes must be supplied as extra named
    inputs by the data iterator when the model is bound/fit.
    """
    last_states = []
    forward_param = []
    backward_param = []
    # Even indices hold the forward-direction state/params, odd the backward.
    for i in range(num_lstm_layer * 2):
        last_states.append(LSTMState(c=mx.sym.Variable("l%d_init_c" % i),
                                     h=mx.sym.Variable("l%d_init_h" % i)))
        if i % 2 == 0:
            forward_param.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
                                           i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
                                           h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
                                           h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))
        else:
            backward_param.append(LSTMParam(i2h_weight=mx.sym.Variable("l%d_i2h_weight" % i),
                                            i2h_bias=mx.sym.Variable("l%d_i2h_bias" % i),
                                            h2h_weight=mx.sym.Variable("l%d_h2h_weight" % i),
                                            h2h_bias=mx.sym.Variable("l%d_h2h_bias" % i)))

    # Network inputs; 'softmax_label' carries the CTC target sequence.
    data = mx.sym.Variable('data')
    label = mx.sym.Variable('softmax_label')
    data = mx.sym.L2Normalization(data)

    # --- CNN feature extractor: four conv blocks ---
    # first conv layer
    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=16)
    relu1 = mx.sym.Activation(data=conv1, act_type="relu")
    # second conv layer
    conv2 = mx.sym.Convolution(data=relu1, kernel=(5, 5), num_filter=32)
    relu2 = mx.sym.Activation(data=conv2, act_type="relu")
    pool2 = mx.sym.Pooling(data=relu2, pool_type="max", kernel=(2, 2), stride=(2, 2))
    l2_norm2 = mx.sym.L2Normalization(pool2)
    d_out2 = mx.sym.Dropout(data=l2_norm2, p=dropout)
    # third conv layer
    conv3 = mx.sym.Convolution(data=d_out2, kernel=(5, 5), num_filter=64)
    relu3 = mx.sym.Activation(data=conv3, act_type="relu")
    pool3 = mx.sym.Pooling(data=relu3, pool_type="max", kernel=(2, 2), stride=(2, 2))
    # fourth conv layer
    conv4 = mx.sym.Convolution(data=pool3, kernel=(5, 5), num_filter=128)
    relu4 = mx.sym.Activation(data=conv4, act_type="relu")
    pool4 = mx.sym.Pooling(data=relu4, pool_type="max", kernel=(2, 2), stride=(2, 2))
    l2_norm4 = mx.sym.L2Normalization(pool4)
    d_out4 = mx.sym.Dropout(data=l2_norm4, p=dropout)

    # Shape-debugging prints; the (128, 1, 150, 150) input shape is hard-coded
    # here and does not match batch_size used later -- TODO confirm intended.
    arg_shape, output_shape, aux_shape = d_out4.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)

    # Reorder NCHW -> (batch, width, channel, height), flatten, then split
    # into seq_len per-time-step slices feeding the LSTM.
    cnn_out = mx.sym.transpose(data=d_out4, axes=(0, 3, 1, 2), name="cnn_out")
    arg_shape, output_shape, aux_shape = cnn_out.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)
    flatten_out = mx.sym.Flatten(data=cnn_out, name="flatten_out")
    arg_shape, output_shape, aux_shape = flatten_out.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)
    wordvec = mx.sym.split(data=flatten_out, num_outputs=seq_len, axis=1)
    arg_shape, output_shape, aux_shape = wordvec.infer_shape(data=(128, 1, 150, 150))
    print(output_shape)

    # Forward-direction unroll through time.
    forward_hidden = []
    for seqidx in range(seq_len):
        hidden = wordvec[seqidx]
        for i in range(num_lstm_layer):
            next_state = lstm(num_hidden, indata=hidden,
                              prev_state=last_states[2 * i],
                              param=forward_param[i],
                              seqidx=seqidx, layeridx=0, dropout=dropout)
            hidden = next_state.h
            last_states[2 * i] = next_state
        forward_hidden.append(hidden)

    # Backward-direction unroll (walks the sequence in reverse; insert(0, ...)
    # restores chronological order).
    backward_hidden = []
    for seqidx in range(seq_len):
        k = seq_len - seqidx - 1
        hidden = wordvec[k]
        for i in range(num_lstm_layer):
            next_state = lstm(num_hidden, indata=hidden,
                              prev_state=last_states[2 * i + 1],
                              param=backward_param[i],
                              seqidx=k, layeridx=1, dropout=dropout)
            hidden = next_state.h
            last_states[2 * i + 1] = next_state
        backward_hidden.insert(0, hidden)

    # Concatenate forward/backward features per step, then stack all steps
    # along axis 0 so one shared FullyConnected projects every step at once.
    hidden_all = []
    for i in range(seq_len):
        hidden_all.append(mx.sym.Concat(*[forward_hidden[i], backward_hidden[i]], dim=1))

    hidden_concat = mx.sym.Concat(*hidden_all, dim=0)
    pred = mx.sym.FullyConnected(data=hidden_concat, num_hidden=num_classes)

    # WarpCTC alternative, kept from the original post:
    #label = mx.sym.Reshape(data=label, shape=(-1,))
    #label = mx.sym.Cast(data=label, dtype='int32')
    #sm = mx.sym.WarpCTC(data=pred, label=label, label_length=num_label, input_length=seq_len)

    # MXNet CTC.
    # NOTE(review): batch_size is read from module scope, not a parameter --
    # TODO confirm. Also mx.sym.contrib.ctc_loss expects data laid out as
    # (seq_len, batch_size, alphabet_size); this Reshape's
    # (num_classes, batch_size, seq_len) ordering looks suspect -- verify.
    pred_ctc = mx.sym.Reshape(data=pred, shape=(num_classes, batch_size, seq_len))

    loss = mx.sym.contrib.ctc_loss(data=pred_ctc, label=label)
    ctc_loss = mx.sym.MakeLoss(loss)

    # Softmax output exposed for inference/metrics only; gradient is blocked
    # so training is driven solely by the CTC loss.
    softmax_class = mx.symbol.SoftmaxActivation(data=pred, name='label')
    softmax_loss = mx.sym.MakeLoss(softmax_class)
    softmax_loss = mx.sym.BlockGrad(softmax_loss)
    sm = mx.sym.Group([softmax_loss, ctc_loss])

    return sm

import logging
import os  # BUG FIX: os.path.join / os.getcwd are used below but os was never imported

# Truncate any previous log file, then mirror logger output into it.
log_file_name = "CNN_LSTM_CNN_MXNet.log"
log_file = open(log_file_name, 'w')
log_file.close()
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)
fh = logging.FileHandler(log_file_name)
logger.addHandler(fh)
prefix = os.path.join(os.getcwd(), 'model', 'lstm_ctc')

# Hyperparameters (batch_size is also read inside cnn_lstm_ctc via module scope).
seq_len = 32
num_label = 15
num_hidden = 200
num_classes = 13550
num_lstm_layer = 1
batch_size = 50

sym = cnn_lstm_ctc(num_lstm_layer, seq_len, num_hidden=num_hidden,
                   num_classes=num_classes, num_label=num_label, dropout=0)

#net_model = mx.mod.Module(symbol=sym, context=mx.gpu())
model = mx.model.FeedForward(symbol=sym, context=[mx.gpu(0)], num_epoch=10,
                             optimizer='Adam',
                             learning_rate=0.01,
                             momentum=0.9,
                             initializer=mx.init.Xavier(factor_type="in", magnitude=2.34))

head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)
logger.info('begin fit')

# NOTE(review): train_iter, val_iter and Accuracy are not defined anywhere in
# this script -- presumably supplied elsewhere; verify. The iterators must
# also provide the l*_init_c / l*_init_h inputs with concrete shapes, or
# InferShape fails exactly as reported in the thread.
model.fit(X=train_iter,
          eval_data=val_iter,
          eval_metric=mx.metric.np(Accuracy),
          epoch_end_callback=mx.callback.do_checkpoint(prefix, period=1),
          batch_end_callback=mx.callback.Speedometer(batch_size, 50),
          logger=logger)