LSTM shape error


Hi all,

A rather dumb question, but I can’t seem to get the shape right again, hah. Let’s say I have time series data with 4 columns (3 features, and I want to predict the 4th column):

from multiprocessing import cpu_count

# Synthetic stand-in for the real series: 3000 rows, 4 columns, values in [0, 1).
dataset = np.random.uniform(0, 1, size=(3000, 4))
CPU_COUNT = cpu_count()

# Mini-batch size; 64 is the value implied by the state shape (1, 64, 500)
# reported in the error message.
batch_size = 64

# The first three columns are the features, the last column is the target.
X = dataset[:, 0:-1]
y = dataset[:, -1]

# Pair each feature row with its target and serve them in mini-batches,
# using one loader worker per CPU core.
dataset_f = gluon.data.ArrayDataset(X, y)
data_loader = gluon.data.DataLoader(dataset_f, batch_size=batch_size, num_workers=CPU_COUNT)

class RNNModel(gluon.Block):
    """LSTM followed by a dense layer that emits one value per time step.

    Parameters
    ----------
    num_embed : int
        Size of the feature axis of the input; passed to the LSTM as
        ``input_size``.
    num_hidden : int
        Number of hidden units per LSTM direction.
    num_layers : int
        Number of stacked LSTM layers.
    bidirectional : bool, optional
        Whether the LSTM runs in both directions. Defaults to ``False``.
    **kwargs
        Forwarded unchanged to ``gluon.Block``.
    """

    def __init__(self, num_embed, num_hidden, num_layers, bidirectional=False, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        # forward() needs the hidden size to flatten the LSTM output, so it
        # has to be kept on the instance.
        self.num_hidden = num_hidden
        # A bidirectional LSTM concatenates the forward and backward outputs,
        # so the feature axis of its output is twice as wide.
        self._rnn_out_units = num_hidden * 2 if bidirectional else num_hidden
        with self.name_scope():
            # layout='TNC': inputs are (sequence_length, batch_size, input_size).
            self.rnn = rnn.LSTM(num_hidden, num_layers, input_size=num_embed,
                                bidirectional=bidirectional, layout='TNC')
            self.decoder = nn.Dense(1, in_units=self._rnn_out_units)

    def forward(self, inputs, hidden):
        """Run the LSTM on ``inputs`` and decode every output step.

        Returns a ``(decoded, hidden)`` pair: ``decoded`` has one row per
        (time step, batch element) with a single column, and ``hidden`` is the
        recurrent state returned by the LSTM.
        """
        output, hidden = self.rnn(inputs, hidden)
        # Collapse the time and batch axes so the dense layer sees 2-D input.
        decoded = self.decoder(output.reshape((-1, self._rnn_out_units)))
        return decoded, hidden

    def begin_state(self, *args, **kwargs):
        """Return the initial recurrent state; arguments go to the LSTM's ``begin_state``."""
        return self.rnn.begin_state(*args, **kwargs)

# Network: one input unit per feature column, a single LSTM layer of 500 units.
model = RNNModel(X.shape[1], 500, 1)
model.collect_params().initialize(init=mx.init.Xavier(), ctx=context)

# Adam optimizer over all model parameters.
trainer = gluon.Trainer(
    model.collect_params(),
    optimizer='adam',
    optimizer_params=dict(learning_rate=0.01),
)

# L1 loss between the prediction and the target column.
loss = gluon.loss.L1Loss()

def train():
    """Train ``model`` for one epoch over ``data_loader`` and print the running loss.

    Uses the module-level ``model``, ``trainer``, ``loss``, ``data_loader``,
    ``context``, ``batch_size``, ``args_clip``, ``args_bptt`` and
    ``args_log_interval``. Returns nothing.
    """
    for epoch in range(1):
        total_L = 0.0
        ibatch = 0
        hidden = model.begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=context)
        for X_batch, y_batch in data_loader:
            # Detach the carried-over state so back-propagation stops at the
            # start of the current batch instead of reaching into earlier ones.
            hidden = [state.detach() for state in hidden]
            # NOTE(review): the LSTM is built with layout='TNC', but X_batch
            # comes from a 2-D array (rows, features). The input likely needs a
            # sequence axis before this call -- confirm the intended shape.
            with autograd.record():
                output, hidden = model(X_batch, hidden)
                L = loss(output, y_batch)
            # Compute the gradients; without this the parameters' grads are
            # never populated.
            L.backward()

            grads = [i.grad(context) for i in model.collect_params().values()]
            # Clip the global gradient norm; the threshold is args_clip scaled
            # by args_bptt and batch_size.
            gluon.utils.clip_global_norm(grads, args_clip * args_bptt * batch_size)

            # Apply the optimizer update, normalising by the batch size.
            trainer.step(batch_size)

            total_L += mx.nd.sum(L).asscalar()
            ibatch += 1
            # Report every args_log_interval batches, matching the divisor
            # used for the average below.
            if ibatch % args_log_interval == 0:
                cur_L = total_L / args_bptt / batch_size / args_log_interval
                print('[Epoch %d Batch %d] loss %.2f, perplexity %.2f' % (
                    epoch + 1, ibatch, cur_L, math.exp(cur_L)))
                total_L = 0.0


When I run it I get this:
ValueError: Invalid recurrent state shape. Expecting (1, 3, 500), got (1, 64, 500).

I know the shape must be (sequence_length, batch_size, input_size). I just can’t understand what exactly is expecting this shape. Is it the weight matrices or something else?


(PS. Tom, Sergey, sorry for the stupid question)


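There seem to be a couple of issues. First of all, you define the layout as TNC, but your input is in the form of (batch_size, features). You either need to reshape the input data or change the layout to NTC. Another problem is the input data format: as you mentioned, the shape must be (sequence_length, batch_size, input_size), but in your code the data has only 2 dimensions. That is also where the error comes from: with the TNC layout the LSTM reads the second axis of your (64, 3) batch as the batch size, so it expects a recurrent state of shape (1, 3, 500), while the state you created with begin_state has shape (1, 64, 500).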