When looking at `mxnet/optimizer/optimizer.py`, I found the code for the Adam optimizer:

```
class Adam(Optimizer):
    """Adam optimizer whose per-parameter step is performed by ``adam_update``.

    Two state arrays (labelled *mean* and *variance*) are kept for every
    weight.  The bias-correction factor ``sqrt(1 - beta2**t) / (1 - beta1**t)``
    is folded into the learning rate before each call to ``adam_update``.

    Parameters
    ----------
    learning_rate : float, default 0.001
        Forwarded to the ``Optimizer`` base-class constructor.
    beta1 : float, default 0.9
        Stored as ``self.beta1``; used in ``1 - beta1**t`` and passed to
        ``adam_update`` as ``beta1``.
    beta2 : float, default 0.999
        Stored as ``self.beta2``; used in ``1 - beta2**t`` and passed to
        ``adam_update`` as ``beta2``.
    epsilon : float, default 1e-8
        Stored as ``self.epsilon`` and passed to ``adam_update`` as
        ``epsilon``.
    lazy_update : bool, default True
        When true, the state arrays are created with the weight's storage
        type; otherwise they use the ``'default'`` storage type.  The flag is
        also forwarded to ``adam_update``.
    **kwargs
        Forwarded unchanged to the ``Optimizer`` base-class constructor.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8,
                 lazy_update=True, **kwargs):
        super(Adam, self).__init__(learning_rate=learning_rate, **kwargs)
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.lazy_update = lazy_update

    def create_state(self, index, weight):
        """Return the zero-initialised ``(mean, variance)`` state for ``weight``.

        Both arrays copy the weight's shape, context and dtype.  ``index`` is
        accepted for interface compatibility and is not used here.
        """
        # Follow the weight's storage type only when lazy updates are enabled.
        stype = weight.stype if self.lazy_update else 'default'
        return (zeros(weight.shape, weight.context, dtype=weight.dtype,
                      stype=stype),  # mean
                zeros(weight.shape, weight.context, dtype=weight.dtype,
                      stype=stype))  # variance

    def update(self, index, weight, grad, state):
        """Apply one Adam step to ``weight``, writing the result back into it.

        Parameters
        ----------
        index
            Key used for the per-parameter update count, learning rate and
            weight decay lookups.
        weight : NDArray
            Parameter to update; also passed as ``out`` to ``adam_update``.
        grad : NDArray
            Gradient for ``weight``.
        state : tuple
            The ``(mean, variance)`` pair produced by ``create_state``.
        """
        assert isinstance(weight, NDArray)
        assert isinstance(grad, NDArray)
        self._update_count(index)
        lr = self._get_lr(index)
        wd = self._get_wd(index)

        # Fold the bias correction for step t into the learning rate.
        t = self._index_update_count[index]
        coef1 = 1. - self.beta1**t
        coef2 = 1. - self.beta2**t
        lr *= math.sqrt(coef2)/coef1

        kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon,
                  'rescale_grad': self.rescale_grad}
        # clip_gradient is forwarded only when it is set to a truthy value.
        if self.clip_gradient:
            kwargs['clip_gradient'] = self.clip_gradient

        mean, var = state
        adam_update(weight, grad, mean, var, out=weight,
                    lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs)
```

How would it be possible to change this in order to accumulate the gradients over several mini-batches before updating?