I have the following simple actor-critic (reinforcement learning) classes:
class actorNN(gluon.HybridBlock):
def __init__(self, **kwargs):
super(actorNN, self).__init__(**kwargs)
with self.name_scope():
self.fc1 = gluon.nn.Dense(8)
self.fc2 = gluon.nn.Dense(2)
def hybrid_forward(self, f, x):
#print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
x = f.relu(self.fc1(x))
return f.softmax(self.fc2(x))
class criticNN(gluon.HybridBlock):
def __init__(self, **kwargs):
super(criticNN, self).__init__(**kwargs)
with self.name_scope():
self.fc1 = gluon.nn.Dense(10)
self.fc2 = gluon.nn.Dense(1)
def hybrid_forward(self, f, x):
#print('type(x): {}, f: {}'.format(type(x).__name__, f.__name__))
x = f.relu(self.fc1(x))
return f.relu(self.fc2(x))
and I am initialising them as
self.__actor = actorNN()
self.__actor.collect_params().initialize(init = mx.init.Xavier(), ctx=self.ctx)#, #mx.init.Normal(sigma=1.)
self.__actorTrainer = gluon.Trainer(self.__actor.collect_params(), 'sgd', {'learning_rate': lr})
#self.__actor.hybridize()
self.__critic = criticNN()
self.__critic.collect_params().initialize(init = mx.init.Xavier(), ctx=self.ctx)#, #mx.init.Normal(sigma=1.)
self.__criticTrainer = gluon.Trainer(self.__critic.collect_params(), 'sgd', {'learning_rate': lr})
#self.__critic.hybridize()
# temp
self.actor = self.__actor
self.critic = self.__critic
self.actorTrainer = self.__criticTrainer
self.criticTrainer = self.__criticTrainer
self.criticLossFn = gluon.loss.L1Loss()
Then training them with (all the variables are on a ndarray)
def learn(self):
rewards = self.rewardBuffer[np.nonzero(self.rewardBuffer)[0]]
rewards = rewards.reshape((rewards.shape[0], 1))
#values = self.valueBuffer[mx_np.nonzero(self.valueBuffer)].as_np_ndarray()
batch_size = len(rewards)
nonZeroInds = np.nonzero(self.observationBuffer)
obs = self.observationBuffer[np.unique(nonZeroInds[0])]
actionTaken = self.actionBuffer[np.nonzero(self.actionBuffer)[0]]
actionTaken = actionTaken.reshape((actionTaken.shape[0], 1))
actionTaken[np.where(np.array(actionTaken) == -1)[0]] = 0
criticInput = mx.nd.concatenate([obs, actionTaken], axis = 1)
with autograd.record():
probs = self.actor(obs)
values = self.critic(criticInput)
discountedRewards = self.discountTheRewards(rewards, values, batch_size)
actorLoss = (-1)*mx.nd.dot(mx.nd.pick(data = probs, index = actionTaken, axis = 1).log(),discountedRewards)
#self.actor.collect_params().zero_grad()
actorLoss.backward()
#autograd.backward(actorLoss)
self.actorTrainer.set_learning_rate(lr)
self.actorTrainer.step(batch_size)
with autograd.record():
values = self.critic(criticInput)
criticLoss = self.criticLossFn(values, discountedRewards)
#self.critic.collect_params().zero_grad()
criticLoss.backward()
self.criticTrainer.set_learning_rate(lr)
self.criticTrainer.step(batch_size)
I print the weights & biases of both actor and critic after the end of each episode, but only a handful of weights of critic’s NN are updated slightly. If I hybridize both actor and critic, and try to train them like that, then I also lost that slight update, and both actor & critic don’t learn anything.
Question:
What am I doing wrong?
In case needed, this is my discountTheRewards
method:
def discountTheRewards(self, rewards, values, batch_size):
if rewards.shape != values.shape:
raise Exception("incompatible rewards-values data!")
discountedRewards = mx.nd.zeros((batch_size,1), dtype = 'float32')
for t in range(1,batch_size):
G = 0
for k in range(t, batch_size):
G = G + mx.nd.power(discountFactor,k-t) * rewards[k]
discountedRewards[t] = G
return (values - discountedRewards)