LSTM/GRU Output Shape Error

Here is my full pipeline (data preparation, model definition and training loop):

import time

import numpy as np
import mxnet
from mxnet import gluon, autograd
from mxnet.gluon import nn, rnn
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm_notebook


def make_dataset_many_to_one(df, seq_len, train_size, variable='Adj Close_MS'):
    
    val_size= ((1-train_size)/2)
    df= shift_exog(df, variable)
    training_days= int(train_size*df.shape[0])
    validation_days= int(val_size*df.shape[0])
    train_values= df.values[:training_days]
    val_values= df.values[training_days:-validation_days]
    test_values= df.values[-validation_days:]
    
    # Scale with MinMaxScaler(0,1) fitted on the training split only,
    # then subtract the training mean from every split.
    scaler= MinMaxScaler(feature_range=(0,1))
    train_values= scaler.fit_transform(train_values)
    training_mean= np.mean(train_values)
    train_values= train_values-training_mean
    
    val_values= scaler.transform(val_values)
    val_values= val_values-training_mean
    
    test_values= scaler.transform(test_values)
    test_values= test_values-training_mean
    
    
    # create windows with length 'seq_len'
    
    WindowsTrain_x = []
    WindowsTrain_y = []

    for i in range(seq_len,len(train_values)):
        WindowsTrain_x.append(train_values[i-seq_len:i])
        WindowsTrain_y.append(train_values[i][-1])
    WindowsTrain_x = mxnet.nd.array(np.array(WindowsTrain_x))
    WindowsTrain_y = mxnet.nd.array(np.array(WindowsTrain_y).reshape(-1,1,1))

    WindowsVal_x= []
    WindowsVal_y= []
    
    for i in range(seq_len,len(val_values)):
        WindowsVal_x.append(val_values[i-seq_len:i])
        WindowsVal_y.append(val_values[i][-1])
    WindowsVal_x= mxnet.nd.array(np.array(WindowsVal_x))
    WindowsVal_y= mxnet.nd.array(np.array(WindowsVal_y).reshape(-1,1,1))

    WindowsTest_x= []
    WindowsTest_y= []
    
    for i in range(seq_len,len(test_values)):
        WindowsTest_x.append(test_values[i-seq_len:i])
        WindowsTest_y.append(test_values[i][-1])
    WindowsTest_x= mxnet.nd.array(np.array(WindowsTest_x))
    WindowsTest_y= mxnet.nd.array(np.array(WindowsTest_y).reshape(-1,1,1))
    
    print('Training Windows have shape: ', WindowsTrain_x.shape, WindowsTrain_y.shape, '\n')
    print('Validation Windows have shape: ', WindowsVal_x.shape, WindowsVal_y.shape, '\n')
    print('Testing Windows have shape: ', WindowsTest_x.shape, WindowsTest_y.shape, '\n')

    
    return WindowsTrain_x, WindowsTrain_y, WindowsVal_x, WindowsVal_y, WindowsTest_x, WindowsTest_y

class MxNetRNNModel(gluon.Block):
    def __init__(self, mode, num_embed, num_hidden, num_layers,  
                 bidirectional=False, **kwargs):
        super(MxNetRNNModel, self).__init__(**kwargs)
        self.num_hidden= num_hidden
        
        with self.name_scope():
            
            if mode == 'lstm':
                self.rnn= rnn.LSTM(num_hidden, num_layers, input_size=num_embed, 
                                  bidirectional=bidirectional, layout='NTC')
            elif mode == 'gru':
                self.rnn= rnn.GRU(num_hidden, num_layers, input_size=num_embed,
                                  bidirectional=bidirectional, layout='NTC')
            else:
                raise ValueError("mode must be 'lstm' or 'gru', got %r" % mode)
            
            self.decoder= nn.Dense(1, in_units=self.num_hidden)
    
    def forward(self, inputs, hidden):
        output, hidden= self.rnn(inputs, hidden)
        # flatten (batch, seq_len, num_hidden) into (batch*seq_len, num_hidden)
        # so that every time step is passed through the Dense layer
        output= output.reshape((-1, self.num_hidden))
        print(output.shape)
        decoded= self.decoder(output)
        return decoded, hidden
    
    def begin_state(self, *args, **kwargs):
        return self.rnn.begin_state(*args, **kwargs) 
    

def MxNetModel(df, mode, num_hidden, num_layers, learning_rate, opt):
    
    model = MxNetRNNModel(mode=mode, num_embed=df.shape[1], num_hidden=num_hidden, num_layers=num_layers)
    device = mxnet.gpu(0) if mxnet.context.num_gpus() > 0 else mxnet.cpu(0)
    model.initialize(mxnet.init.Xavier(), ctx=device)
    trainer = gluon.Trainer(params=model.collect_params(), optimizer=opt, optimizer_params={'learning_rate': learning_rate})
    loss_function = gluon.loss.L2Loss()
    print(model)
    
    return model, trainer, loss_function

def MxNetTrain(net, trainer, loss_function, WindowsTrain_x, WindowsTrain_y, WindowsVal_x, WindowsVal_y, WindowsTest_x, WindowsTest_y, batch_size=32, epochs=50):
    
    args_save = 'model.param'
    validation_L= []
    best_val = float("Inf")
    print_period = epochs // 5
    device = mxnet.gpu(0) if mxnet.context.num_gpus() > 0 else mxnet.cpu(0)
    
    # creating nd.arrayIter (training, validation and testing)
    train_iter = mxnet.io.NDArrayIter(WindowsTrain_x, WindowsTrain_y, batch_size = batch_size)
    val_iter = mxnet.io.NDArrayIter(WindowsVal_x, WindowsVal_y, batch_size = batch_size)
    test_iter = mxnet.io.NDArrayIter(WindowsTest_x, WindowsTest_y, batch_size = batch_size)
    hidden = net.begin_state(func=mxnet.nd.zeros, batch_size=batch_size, ctx=device)

    total_loss=0.0
    for epoch in tqdm_notebook(range(epochs), desc='epochs'):
        start_time= time.time()
        train_iter.reset()  # NDArrayIter is exhausted after one pass, so reset it every epoch
        
        
        for trn_batch in train_iter:
            x = trn_batch.data[0].as_in_context(device)
            y = trn_batch.label[0].as_in_context(device)
            print(x.shape, y.shape)
            #hidden= detach(hidden)
            with autograd.record():
                output, hidden= net(x, hidden)
                L= loss_function(output,y)
                L.backward()
                
            grads = [i.grad(device) for i in net.collect_params().values()]
            # Here gradient is for the whole batch.
            # So we multiply max_norm by batch_size and bptt size to balance it.
            gluon.utils.clip_global_norm(grads, batch_size)
            trainer.step(batch_size,ignore_stale_grad=False)
            total_loss += mxnet.nd.sum(L).asscalar()

WindowsTrain_x, WindowsTrain_y, WindowsVal_x, WindowsVal_y, WindowsTest_x, WindowsTest_y= make_dataset_many_to_one(df, seq_len=20, train_size=0.66, variable='Adj Close_MS')
net, trainer, loss_function= MxNetModel(df, mode='lstm', num_hidden=20, num_layers=2, learning_rate=0.005, opt='adam')
MxNetTrain(net, trainer, loss_function, WindowsTrain_x, WindowsTrain_y, WindowsVal_x, WindowsVal_y, WindowsTest_x, WindowsTest_y, batch_size=32, epochs=500)

I get this error:

I do not understand why the shape of the model's output does not match the shape of the y batch.
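
To illustrate what I mean, here is a minimal sketch of how I understand the shapes. The feature count (5) and the random input are just placeholders, not my actual df; the layout, reshape and Dense layer are the same as in my forward method above.

# Minimal shape sketch (placeholder sizes, same hyperparameters as above otherwise).
from mxnet import nd
from mxnet.gluon import nn, rnn

batch_size, seq_len, num_features, num_hidden = 32, 20, 5, 20  # 5 features is a placeholder

lstm = rnn.LSTM(num_hidden, num_layers=2, input_size=num_features, layout='NTC')
decoder = nn.Dense(1, in_units=num_hidden)
lstm.initialize()
decoder.initialize()

x = nd.random.uniform(shape=(batch_size, seq_len, num_features))
hidden = lstm.begin_state(func=nd.zeros, batch_size=batch_size)

out, hidden = lstm(x, hidden)         # (32, 20, 20) with layout='NTC'
flat = out.reshape((-1, num_hidden))  # (640, 20): one row per time step
pred = decoder(flat)                  # (640, 1)
y = nd.zeros((batch_size, 1, 1))      # a label batch from NDArrayIter: (32, 1, 1)
print(pred.shape, y.shape)            # (640, 1) vs (32, 1, 1)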