MXNet Actor-Critic Model: the weights of the actor & critic are not updated

I have the following simple actor-critic (reinforcement learning) classes:

import mxnet as mx
import numpy as np
from mxnet import gluon, autograd

class actorNN(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(actorNN, self).__init__(**kwargs)
        with self.name_scope():
            self.fc1 = gluon.nn.Dense(8)
            self.fc2 = gluon.nn.Dense(2)

    def hybrid_forward(self, f, x):
        #print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
        x = f.relu(self.fc1(x))
        return f.softmax(self.fc2(x))

class criticNN(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(criticNN, self).__init__(**kwargs)
        with self.name_scope():
            self.fc1 = gluon.nn.Dense(10)
            self.fc2 = gluon.nn.Dense(1)

    def hybrid_forward(self, f, x):
        #print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
        x = f.relu(self.fc1(x))
        return f.relu(self.fc2(x))

and I am initialising them as follows:

self.__actor = actorNN()
self.__actor.collect_params().initialize(init=mx.init.Xavier(), ctx=self.ctx)  # (alternative tried: mx.init.Normal(sigma=1.))
self.__actorTrainer = gluon.Trainer(self.__actor.collect_params(), 'sgd', {'learning_rate': lr})
#self.__actor.hybridize()

self.__critic = criticNN()
self.__critic.collect_params().initialize(init=mx.init.Xavier(), ctx=self.ctx)  # (alternative tried: mx.init.Normal(sigma=1.))
self.__criticTrainer = gluon.Trainer(self.__critic.collect_params(), 'sgd', {'learning_rate': lr})
#self.__critic.hybridize()

# temp
self.actor = self.__actor
self.critic = self.__critic
self.actorTrainer = self.__criticTrainer
self.criticTrainer = self.__criticTrainer
self.criticLossFn = gluon.loss.L1Loss()

Then I train them with the following method (all the variables are MXNet NDArrays):

def learn(self):
    rewards = self.rewardBuffer[np.nonzero(self.rewardBuffer)[0]]
    rewards = rewards.reshape((rewards.shape[0], 1))
    #values = self.valueBuffer[mx_np.nonzero(self.valueBuffer)].as_np_ndarray()
    batch_size = len(rewards)
    nonZeroInds = np.nonzero(self.observationBuffer)
    obs = self.observationBuffer[np.unique(nonZeroInds[0])]
    actionTaken = self.actionBuffer[np.nonzero(self.actionBuffer)[0]]
    actionTaken = actionTaken.reshape((actionTaken.shape[0], 1))
    actionTaken[np.where(np.array(actionTaken) == -1)[0]] = 0
    criticInput = mx.nd.concatenate([obs, actionTaken], axis=1)
    with autograd.record():
        probs = self.actor(obs)
        values = self.critic(criticInput)
        discountedRewards = self.discountTheRewards(rewards, values, batch_size)
        actorLoss = (-1) * mx.nd.dot(mx.nd.pick(data=probs, index=actionTaken, axis=1).log(), discountedRewards)
    #self.actor.collect_params().zero_grad()
    actorLoss.backward()
    #autograd.backward(actorLoss)
    self.actorTrainer.set_learning_rate(lr)
    self.actorTrainer.step(batch_size)

    with autograd.record():
        values = self.critic(criticInput)
        criticLoss = self.criticLossFn(values, discountedRewards)
    #self.critic.collect_params().zero_grad()
    criticLoss.backward()
    self.criticTrainer.set_learning_rate(lr)
    self.criticTrainer.step(batch_size)

I print the weights & biases of both the actor and the critic at the end of each episode, but only a handful of the critic's weights change, and only slightly. If I hybridize both the actor and the critic and train them like that, even that slight update disappears, and neither network learns anything.
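For reference, this is roughly how I check the parameters after each episode (a simplified sketch of my logging, not the exact code; the param.grad() call is included just to show that the gradients can be inspected the same way):

# rough per-episode check: dump every parameter of both networks,
# together with the gradient buffer held on self.ctx
for name, param in self.actor.collect_params().items():
    print('actor', name, param.data(self.ctx), param.grad(self.ctx))
for name, param in self.critic.collect_params().items():
    print('critic', name, param.data(self.ctx), param.grad(self.ctx))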

Question:

What am I doing wrong?


In case it is needed, this is my discountTheRewards method:

def discountTheRewards(self, rewards, values, batch_size):
    if rewards.shape != values.shape:
        raise Exception("incompatible rewards-values data!")

    discountedRewards = mx.nd.zeros((batch_size, 1), dtype='float32')
    for t in range(1, batch_size):
        G = 0
        for k in range(t, batch_size):
            G = G + mx.nd.power(discountFactor, k - t) * rewards[k]
        discountedRewards[t] = G
    return (values - discountedRewards)
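Just to be explicit about what this returns: for every t >= 1 it computes the discounted future return G_t = rewards[t] + discountFactor * rewards[t+1] + discountFactor^2 * rewards[t+2] + ... (G_0 stays at zero because the outer loop starts at t = 1; e.g. rewards = [1, 1, 1] with discountFactor = 0.9 gives G = [0, 1.9, 1.0]), and the method returns values - G, which I then use both as the weight in the actor loss and as the target of the critic's L1 loss.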