MXNet crashes when entering backward the second time

#1

My code:

with mx.autograd.record():
    cls_maps = []
    box_maps = []
    for x in data:
        cls_map, box_map = net(x)
        cls_maps.append(cls_map)
        box_maps.append(box_map)
    sum_losses, cls_loss, loc_loss, true_num, pos_num, true_pos = xloss(cls_maps, box_maps, targets)
    print(sum_losses)
    mx.autograd.backward(sum_losses)
    print('xx')

Backtrace:

[2019-04-08 01:23:48] Namespace(batch_size=128, cls_weight=1.0, epochs=32, gpus='2', init='xavier', loc_loss='smoothl1', loc_weight=1.0, log_interval=1000, lr=0.01, lr_decay=0.1, lr_decay_epoch='14,28', momentum=0.9, neg_thresh=0.2, network='xpnet', num_workers=16, ohem_ratio=3, part_thresh=0.4, pos_thresh=0.5, resume='', save_interval=1, save_prefix='models/mtcnn/xpnet/', seed=233, start_epoch=0, val_interval=1, wd=0.0005)
[2019-04-08 01:23:55] Start training from [Epoch 0]
[
[0.01407763 0.01240903 0.01362846 0.01556144 0.01254101 0.01595419
 0.01081374 0.0139483  0.0147873  0.01587322 0.01383554 0.01187025
 0.01350198 0.01445905 0.01490223 0.01433355 0.01500035 0.08944204
 0.01241579 0.01383635 0.01656276 0.01513614 0.01387173 0.01458692
 0.02033376 0.01495883 0.01450217 0.01132379 0.01568248 0.01138312
 0.01673025 0.01568954 0.00999019 0.01531774 0.01330752 0.01558442
 0.01273274 0.07829046 0.01691306 0.01506827 0.01218625 0.0159149
 0.01468375 0.06446139 0.05079298 0.09157926 0.03881067 0.06517157
 0.03859201 0.07447457 0.0217255  0.03802549 0.06723116 0.08343804
 0.06224171 0.01801509 0.01381754 0.08061062 0.05628756 0.05798602
 0.07322764 0.05042768 0.02382251 0.01122834 0.06033935 0.08901118
 0.06157441 0.0769331  0.05577362 0.05515752 0.04530125 0.01515508
 0.01267787 0.01411963 0.0649811  0.04943382 0.04156324 0.01530558
 0.01330901 0.01530274 0.03853543 0.05781655 0.0669197  0.01397216
 0.03356314 0.05775301 0.0578515  0.05734032 0.05458724 0.07108882
 0.09064264 0.05613307 0.02392583 0.01676747 0.0373242  0.04070126
 0.06609067 0.07135201 0.06576351 0.07928603 0.0377898  0.05343663
 0.05364268 0.04078859 0.032362   0.0595645  0.04155075 0.04532195
 0.01592593 0.06205503 0.07083612 0.04053818 0.0436005  0.05752687
 0.06839994 0.04348036 0.08063961 0.01369315 0.01211654 0.02235535
 0.01474376 0.05575649 0.04824112 0.0412776  0.05023791 0.04675512
 0.06616293 0.03535084]
<NDArray 128 @cpu(0)>]
xx
[
[0.0461057  0.04311114 0.01007549 0.01019483 0.01261313 0.01052247
 0.05715363 0.00993868 0.03236393 0.05793261 0.05574871 0.05383644
 0.04205071 0.01026835 0.00972549 0.05104025 0.01095377 0.03886776
 0.01817816 0.00758897 0.01314092 0.05007691 0.0444857  0.03471864
 0.05491452 0.04055006 0.04437283 0.03874899 0.01171677 0.04040579
 0.03832798 0.04198241 0.06802291 0.0101934  0.04010667 0.03530842
 0.00968803 0.01118376 0.03207314 0.06906639 0.0494512  0.0266571
 0.0426003  0.00863507 0.06837383 0.04775794 0.04194265 0.05360783
 0.06159041 0.05396234 0.03559795 0.04479052 0.01131241 0.00917019
 0.03773171 0.04837854 0.06176953 0.06986631 0.0309676  0.07272735
 0.01217951 0.04873266 0.0349315  0.05589979 0.07066163 0.03415564
 0.06378014 0.0483586  0.04744243 0.01053689 0.05461641 0.01024267
 0.01156786 0.03343847 0.04400596 0.03273573 0.01068421 0.04600831
 0.04017534 0.04249977 0.06326544 0.04578679 0.03339545 0.03330716
 0.04672096 0.0529854  0.00901068 0.03772534 0.01116634 0.06090269
 0.08600384 0.01246181 0.02962612 0.00896593 0.0335142  0.06837423
 0.04686296 0.03302075 0.01268331 0.00996092 0.01426105 0.04551787
 0.00825613 0.03391191 0.05838149 0.00877391 0.04538741 0.03334597
 0.04608255 0.08213078 0.04248736 0.0543548  0.01057027 0.05453919
 0.05084716 0.01079561 0.06019    0.00944663 0.06341526 0.05255142
 0.04612044 0.00792102 0.04052918 0.04942035 0.04448518 0.01060844
 0.00924082 0.03040769]
<NDArray 128 @cpu(0)>]

Segmentation fault: 11

Stack trace returned 10 entries:
[bt] (0) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x447b5a) [0x7fc3b7b34b5a]
[bt] (1) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x36e2d46) [0x7fc3badcfd46]
[bt] (2) /lib/x86_64-linux-gnu/libc.so.6(+0x354b0) [0x7fc43d3034b0]
[bt] (3) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::SetDependency(nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> >*, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> >*, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> >*, std::vector<unsigned int, std::allocator<unsigned int> >*, mxnet::DispatchMode)+0x271) [0x7fc3ba5c3bd1]
[bt] (4) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::Imperative::InvokeOp(mxnet::Context const&, nnvm::NodeAttrs const&,std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, mxnet::DispatchMode, mxnet::OpStatePtr)+0x18a) [0x7fc3ba5c508a]
[bt] (5) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(+0x2ee2ef0) [0x7fc3ba5cfef0]
[bt] (6) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::InvokeOperator(nnvm::IndexedGraph const&, int, bool, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> >, mxnet::Context, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> >*, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> >, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> >, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> >*, std::vector<unsigned int, std::allocator<unsigned int> >*, std::function<void (mxnet::OpStatePtr const&)>)+0x565) [0x7fc3ba5d0795]
[bt] (7) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::imperative::RunGraph(bool, nnvm::IndexedGraph const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> >, unsigned long, unsigned long, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> >&&, std::vector<unsignedint, std::allocator<unsigned int> >&&, std::vector<mxnet::OpStatePtr, std::allocator<mxnet::OpStatePtr> >*, std::vector<mxnet::DispatchMode, std::allocator<mxnet::DispatchMode> > const&, bool, std::vector<mxnet::TShape, std::allocator<mxnet::TShape> >*)+0x489) [0x7fc3ba5d1a69]
[bt] (8) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(mxnet::Imperative::Backward(std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, bool, bool, bool)+0x3ab6) [0x7fc3ba5cd116]
[bt] (9) /home/yf/.software/anaconda3/envs/py36/lib/python3.6/site-packages/mxnet/libmxnet.so(MXAutogradBackwardEx+0x573) [0x7fc3ba4bbb73]

Hope to get your help.

#2

Try using sum_losses.backward() instead of “mx.autograd.backward(sum_losses)”

And welcome to the community.

#3

Thank you for your advice. After changing it to loss.backward(), it crashes in the same way.

for loss in sum_losses:
    loss.backward()
#4

Can you elaborate on how you are calculating “sum_losses”?

#5

Here is the minimal code to reproduce the issue.

import mxnet as mx
from mxnet import nd
from mxnet import gluon
from mxnet.gluon import nn

class OhemLoss(gluon.Block):
    def __init__(self, pos_thresh=0.5, neg_thresh=0.3, ohem_ratio=3, **kwargs):
        super(OhemLoss, self).__init__(**kwargs)
        self._pos_thresh = pos_thresh
        self._neg_thresh = neg_thresh
        self._ohem_ratio = ohem_ratio
    
    def forward(self, cls_map, targets):
        pred = cls_map.reshape(0,2,-1).transpose((0,2,1))
        ious = targets[:,:,0]
        pos_mask = ious >= self._pos_thresh
        neg_mask = ious < self._neg_thresh
        pos_num = nd.sum(pos_mask).asscalar()
        neg_num = nd.sum(neg_mask).asscalar()
        min_neg_num = min(pos_num * self._ohem_ratio, pos_mask.size - pos_num)
        loss = nd.log_softmax(pred)
        loss = -nd.pick(loss, pos_mask, axis=2, keepdims=False)
        if neg_num < min_neg_num:
            # mask out positive samples
            neg_loss = nd.where(pos_mask, nd.zeros_like(pos_mask), loss)
            argmaxs = neg_loss.reshape(-1,).argsort(is_ascend=False)
            neg_mask = argmaxs.argsort().reshape_like(neg_mask) < min_neg_num
            print(neg_mask)
            cls_mask = nd.logical_or(pos_mask, neg_mask)
            print(cls_mask)
            loss = nd.where(cls_mask, loss, nd.zeros_like(loss))
        return loss

class Network(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(Network, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(4, 1, prefix='conv1_')
        self.conv2 = nn.Conv2D(8, 1, prefix='conv2_')

    def hybrid_forward(self, F, x):
        return self.conv1(x), self.conv2(x)

if __name__ == '__main__':
    mx.random.seed(0)
    ctx = mx.cpu()
    net = Network()
    initializer = mx.init.Xavier(factor_type='in', rnd_type='gaussian', magnitude=2)
    net.initialize(init=initializer, ctx=ctx)
    net.hybridize()

    batch_size = 1
    data = mx.nd.random.randn(batch_size, 3, 1, 1).as_in_context(ctx)
    targets = mx.nd.random.uniform(0, 1, (batch_size, 2, 2))
    oloss = OhemLoss()

    for _ in range(10):
        print(data.shape, targets.shape)
        with mx.autograd.record():
            cls_map, box_map = net(data)
            mx.autograd.backward(oloss(cls_map, targets))
            print('run batch')

When I change the random seed or change the targets shape to (batch_size, 2, 1), it works fine.

#6

The shape of cls_map must match the shape of targets, so changing the targets shape to (batch_size, 2, 1) should resolve your problem.
But I don’t understand why changing the random seed helps. The seed has nothing to do with the program logic; it only sets the values produced by the random initializer.
Sorry, I can’t tell you more about that.

#7
import mxnet as mx
from mxnet import nd
from mxnet import gluon
from mxnet.gluon import nn

class OhemLoss(gluon.Block):
    def __init__(self, pos_thresh=0.5, neg_thresh=0.3, ohem_ratio=3, **kwargs):
        super(OhemLoss, self).__init__(**kwargs)
        self._pos_thresh = pos_thresh
        self._neg_thresh = neg_thresh
        self._ohem_ratio = ohem_ratio
    
    def forward(self, cls_map, ious):
        pred = cls_map.reshape(0,2,-1).transpose((0,2,1))
        pos_mask = ious >= self._pos_thresh
        neg_mask = ious < self._neg_thresh
        pos_num = nd.sum(pos_mask).asscalar()
        neg_num = nd.sum(neg_mask).asscalar()
        min_neg_num = min(pos_num * self._ohem_ratio, pos_mask.size - pos_num)
        loss = nd.log_softmax(pred)
        loss = -nd.pick(loss, pos_mask, axis=2, keepdims=False)
        if neg_num < min_neg_num:
            # mask out positive samples
            neg_loss = nd.where(pos_mask, nd.zeros_like(pos_mask), loss)
            argmaxs = neg_loss.reshape(-1,).argsort(is_ascend=False)
            neg_mask = argmaxs.argsort().reshape_like(neg_mask) < min_neg_num
            cls_mask = nd.logical_or(pos_mask, neg_mask)
            print(pos_mask, neg_mask, cls_mask)
            loss = nd.where(cls_mask, loss, nd.zeros_like(loss))
        return loss

class Network(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(Network, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(4, 1, prefix='conv1_')
        self.conv2 = nn.Conv2D(8, 1, prefix='conv2_')

    def hybrid_forward(self, F, x):
        return self.conv1(x), self.conv2(x)

if __name__ == '__main__':
    ctx = mx.cpu()
    net = Network()
    initializer = mx.init.Xavier(factor_type='in', rnd_type='gaussian', magnitude=2)
    net.initialize(init=initializer, ctx=ctx)
    net.hybridize()

    batch_size = 1
    data = mx.nd.zeros((batch_size, 3, 1, 1)).as_in_context(ctx)
    ious = mx.nd.array([[0.5448832, 0.4236548]])
    oloss = OhemLoss()

    for _ in range(10):
        print(data.shape, ious.shape)
        with mx.autograd.record():
            cls_map, box_map = net(data)
            mx.autograd.backward(oloss(cls_map, ious))
            print('run batch')

To make it simpler, I removed targets and the randomness and fixed the variables; it still crashes.

In the previous version, targets was sliced into ious inside the loss, and the shape of ious is consistent with the class predictions (pred).

#8

Here is a really simplified version. :thinking:

from mxnet import nd, autograd

ious = nd.array([[0.6, 0.4]])
pred = nd.zeros((1,2,2))
pred.attach_grad()

with autograd.record():
    pos_mask = ious >= 0.5
    loss = nd.log_softmax(pred)
    loss = -nd.pick(loss, pos_mask, axis=2, keepdims=False)
    neg_loss = nd.where(pos_mask, nd.zeros_like(pos_mask), loss)
    argmaxs = neg_loss.reshape(-1,).argsort(is_ascend=False)
    neg_mask = argmaxs.argsort().reshape_like(pos_mask) < 1
    cls_mask = nd.logical_or(pos_mask, neg_mask)
    loss = nd.where(cls_mask, loss, nd.zeros_like(loss))
loss.backward()

Thank you for the advice!

#9

I bypassed this problem, though I don’t know why it works.

from mxnet import nd, autograd

ious = nd.array([[0.6, 0.4]])
pred = nd.zeros((1,2,2))
pred.attach_grad()

with autograd.record():
    ious_ = ious.reshape(-1,)
    pred_ = pred.reshape(-1, 2)
    pos_mask = ious_ >= 0.5
    loss = nd.log_softmax(pred_)
    loss = -nd.pick(loss, pos_mask, axis=-1, keepdims=False)  # pred_ is 2-D here, so pick along the last axis
    neg_loss = nd.where(pos_mask, nd.zeros_like(pos_mask), loss)
    argmaxs = neg_loss.argsort(is_ascend=False)
    neg_mask = argmaxs.argsort() < 1
    cls_mask = nd.logical_or(pos_mask, neg_mask)
    loss = nd.where(cls_mask, loss, nd.zeros_like(loss))
    loss = loss.reshape_like(ious)
loss.backward()
#10

If you compute the backward operation outside the record scope, it is not recorded for higher-order computation; otherwise it is. If you are not explicitly trying to get 2nd-order gradients, you should always call .backward() outside the .record() scope.

#11

That’s really useful advice. Thank you :+1: