MXNet: how to concat the input data and output features in the train_iter

I need to create a network using MXNet where the train_iter includes both the original input data and the output features of a middle layer. The original input data is MNIST, and the middle layer is a ReLU.
What I mean is that I want the output features to also be part of the "input" training data.

Hi @mxnetwqs,

You have complete control over your network architecture, so you can take the array (or symbol) representing the input data and the array (or symbol) representing the feature map of interest, and pass both to another layer for processing. I don’t know how you are going to be using these two inputs, but one method (if they have the same spatial dimensions) is to use the concatenate operation to stack the arrays (or symbols) depthwise.

An example in Gluon would look something like:

import mxnet as mx
from mxnet import gluon, nd
from mxnet.gluon import nn

class Net(gluon.Block):
    def __init__(self, **kwargs):
        super(Net, self).__init__(**kwargs)
        with self.name_scope():
            self.conv1 = nn.Conv2D(8, kernel_size=3, padding=1)
            self.conv2 = nn.Conv2D(16, kernel_size=3, padding=1)
            self.fc1 = nn.Dense(10)

    def forward(self, data):
        conv1_out = nd.relu(self.conv1(data))
        conv2_out = nd.relu(self.conv2(conv1_out))
        # work with input data AND feature map from here onwards
        concat = nd.concat(data, conv2_out, dim=1)  # stack depthwise along the channel axis
        output = self.fc1(concat)
        return output
    
net = Net()
net.initialize()
data = nd.random.normal(shape=(1,3,32,32))
out = net(data)

I hope that helps. Cheers, Thom

Do you know how to hybridize your Net with concat in the forward?

Hi @chrisluu, you need to make your network a HybridBlock

import mxnet as mx
from mxnet import gluon, nd
from mxnet.gluon import nn

class Net(gluon.HybridBlock):
    def __init__(self, **kwargs):
        super(Net, self).__init__(**kwargs)
        with self.name_scope():
            self.conv1 = nn.Conv2D(8, kernel_size=3, padding=1)
            self.conv2 = nn.Conv2D(16, kernel_size=3, padding=1)
            self.fc1 = nn.Dense(10)

    def hybrid_forward(self, F, data):
        conv1_out = F.relu(self.conv1(data))
        conv2_out = F.relu(self.conv2(conv1_out))
        # work with input data AND feature map from here onwards
        concat = F.concat(data, conv2_out, dim=1)  # stack depthwise along the channel axis
        output = self.fc1(concat)
        return output

net = Net()
net.initialize()
net.hybridize()
data = nd.random.normal(shape=(1,3,32,32))
out = net(data)

Hi @ThomasDelteil, thanks for your reply. Actually, I want to concat the outputs of GlobalAvgPool and GlobalMaxPool. Here is my code; I found that it cannot run backward.

class BasicBlockV1b(nn.HybridBlock):
    def __init__(self, num_channels, use_1x1conv=False, strides=1, num_gpus=4, **kwargs):
        super(BasicBlockV1b, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1, strides=strides, use_bias=False)
        self.bn1 = contrib.nn.SyncBatchNorm(num_devices=num_gpus)
        self.relu = nn.Activation('relu')
        self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1, use_bias=False)
        self.bn2 = contrib.nn.SyncBatchNorm(num_devices=num_gpus)
        if use_1x1conv:
            self.downsample = nn.HybridSequential()
            self.downsample.add(nn.Conv2D(num_channels, kernel_size=1, strides=strides, use_bias=False),
                                contrib.nn.SyncBatchNorm(num_devices=num_gpus))
        else:
            self.downsample = None

    def hybrid_forward(self, X):
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.downsample:
            X = self.downsample(X)
        return self.relu(Y + X)

def resnet_block(num_channels, num_residuals, first_block=False, num_gpus=4):
    blk = nn.HybridSequential()
    for i in range(num_residuals):
        if i == 0 and not first_block:
            blk.add(BasicBlockV1b(num_channels, use_1x1conv=True, strides=2, num_gpus=num_gpus))
        else:
            blk.add(BasicBlockV1b(num_channels, num_gpus=num_gpus))
    return blk

class ConcatLayer(nn.HybridBlock):
    """Concat operation for multiple inputs."""
    def __init__(self, **kwargs):
        super(ConcatLayer, self).__init__(**kwargs)
        self.gap = nn.GlobalAvgPool2D()
        self.gmp = nn.GlobalMaxPool2D()
        self.flat = nn.Flatten()

    def hybrid_forward(self, X):
        gap = self.flat(self.gap(X))
        gmp = self.flat(self.gmp(X))
        gap_norm = nd.L2Normalization(gap, mode='instance')
        gmp_norm = nd.L2Normalization(gmp, mode='instance')
        return nd.concat(gap_norm, gmp_norm, dim=1)

class ResNet34V1bSyncBN(nn.HybridSequential):
    def __init__(self, classes=28, num_gpus=4, **kwargs):
        super(ResNet34V1bSyncBN, self).__init__(**kwargs)
        self.conv1 = nn.Conv2D(64, kernel_size=7, strides=2, padding=3, use_bias=False)
        self.bn1 = contrib.nn.SyncBatchNorm(num_devices=num_gpus)
        self.relu = nn.Activation('relu')
        self.maxpool = nn.MaxPool2D(pool_size=3, strides=2, padding=1)
        self.layer1 = resnet_block(64, 3, first_block=True, num_gpus=num_gpus)
        self.layer2 = resnet_block(128, 4, num_gpus=num_gpus)
        self.layer3 = resnet_block(256, 6, num_gpus=num_gpus)
        self.layer4 = resnet_block(512, 3, num_gpus=num_gpus)
        self.concat = ConcatLayer()
        self.fc = nn.Dense(classes)

    def hybrid_forward(self, X, masks):
        Y = self.maxpool(self.relu(self.bn1(self.conv1(X))))
        Y = self.layer4(self.layer3(self.layer2(self.layer1(Y))))
        Y = self.fc(self.concat(Y * masks))
        return Y

Output architecture is:

net:  ResNet34V1bSyncBN(
  (conv1): Conv2D(None -> 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm108_', in_channels=None)
  (relu): Activation(relu)
  (maxpool): MaxPool2D(size=(3, 3), stride=(2, 2), padding=(1, 1), ceil_mode=False)
  (layer1): HybridSequential(
    (0): BasicBlockV1b(
      (conv1): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm109_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm110_', in_channels=None)
    )
    (1): BasicBlockV1b(
      (conv1): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm111_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm112_', in_channels=None)
    )
    (2): BasicBlockV1b(
      (conv1): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm113_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm114_', in_channels=None)
    )
  )
  (layer2): HybridSequential(
    (0): BasicBlockV1b(
      (conv1): Conv2D(None -> 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm115_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm116_', in_channels=None)
      (downsample): HybridSequential(
        (0): Conv2D(None -> 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm117_', in_channels=None)
      )
    )
    (1): BasicBlockV1b(
      (conv1): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm118_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm119_', in_channels=None)
    )
    (2): BasicBlockV1b(
      (conv1): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm120_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm121_', in_channels=None)
    )
    (3): BasicBlockV1b(
      (conv1): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm122_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm123_', in_channels=None)
    )
  )
  (layer3): HybridSequential(
    (0): BasicBlockV1b(
      (conv1): Conv2D(None -> 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm124_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm125_', in_channels=None)
      (downsample): HybridSequential(
        (0): Conv2D(None -> 256, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm126_', in_channels=None)
      )
    )
    (1): BasicBlockV1b(
      (conv1): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm127_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm128_', in_channels=None)
    )
    (2): BasicBlockV1b(
      (conv1): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm129_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm130_', in_channels=None)
    )
    (3): BasicBlockV1b(
      (conv1): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm131_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm132_', in_channels=None)
    )
    (4): BasicBlockV1b(
      (conv1): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm133_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm134_', in_channels=None)
    )
    (5): BasicBlockV1b(
      (conv1): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm135_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm136_', in_channels=None)
    )
  )
  (layer4): HybridSequential(
    (0): BasicBlockV1b(
      (conv1): Conv2D(None -> 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm137_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm138_', in_channels=None)
      (downsample): HybridSequential(
        (0): Conv2D(None -> 512, kernel_size=(1, 1), stride=(2, 2), bias=False)
        (1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm139_', in_channels=None)
      )
    )
    (1): BasicBlockV1b(
      (conv1): Conv2D(None -> 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm140_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm141_', in_channels=None)
    )
    (2): BasicBlockV1b(
      (conv1): Conv2D(None -> 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm142_', in_channels=None)
      (relu): Activation(relu)
      (conv2): Conv2D(None -> 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=1, key='syncbatchnorm143_', in_channels=None)
    )
  )
  (concat): ConcatLayer(
    (gap): GlobalAvgPool2D(size=(1, 1), stride=(1, 1), padding=(0, 0), ceil_mode=True)
    (gmp): GlobalMaxPool2D(size=(1, 1), stride=(1, 1), padding=(0, 0), ceil_mode=True)
    (flat): Flatten
  )
  (fc): Dense(None -> 28, linear)
)

Test code:

net = ResNet34V1bSyncBN()
# init
net.initialize(init.Xavier())
loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
with autograd.record():
    y_hat = net(data_array.as_in_context(ctx), mask_data_array.as_in_context(ctx))
    l = loss(y_hat, y)
l.backward()

Details about the input data:

data_array.dtype:  <class 'numpy.float32'>
data_array.shape:  (4, 4, 2048, 2048)
mask_data_array.dtype:  <class 'numpy.float32'>
mask_data_array.shape:  (4, 1, 64, 64)

And I got an error:

TypeError                                 Traceback (most recent call last)
<ipython-input-7-da82e6b42ae7> in <module>()
     26 loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
     27 with autograd.record():
---> 28     y_hat = net(data_array.as_in_context(ctx), mask_data_array.as_in_context(ctx))
     29     l = loss(y_hat, y)
     30 l.backward()

/home/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py in __call__(self, *args)
    539             hook(self, args)
    540 
--> 541         out = self.forward(*args)
    542 
    543         for hook in self._forward_hooks.values():

/home/anaconda3/lib/python3.6/site-packages/mxnet/gluon/block.py in forward(self, x, *args)
    916                     params = {i: j.data(ctx) for i, j in self._reg_params.items()}
    917 
--> 918                 return self.hybrid_forward(ndarray, x, *args, **params)
    919 
    920         assert isinstance(x, Symbol), \

TypeError: hybrid_forward() takes 3 positional arguments but 4 were given

Hi @chrisluu

I fixed your code; you had a few issues. The hybrid_forward signature is hybrid_forward(self, F, x). You forgot the F, hence your first error. F is a function space: it resolves to nd if the network is not hybridized and to sym if it is.
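To see F switch between the two namespaces, here is a minimal sketch (a toy block of my own, not part of your network) that prints which module F resolves to:

import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn

class Printer(nn.HybridBlock):
    def hybrid_forward(self, F, x):
        # F is the mxnet.ndarray module when running imperatively,
        # and the mxnet.symbol module while the hybridized graph is being traced
        print(F.__name__)
        return F.relu(x)

p = Printer()
p.initialize()
p(nd.ones((1, 3)))   # prints 'mxnet.ndarray'
p.hybridize()
p(nd.ones((1, 3)))   # prints 'mxnet.symbol' during the first (tracing) call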

Rewriting your code I get:

import mxnet as mx
from mxnet import gluon
from mxnet.gluon import nn, contrib
from mxnet import init, autograd

class BasicBlockV1b(nn.HybridBlock):
    def __init__(self, num_channels, use_1x1conv=False, strides=1, num_gpus=4, **kwargs):
        super(BasicBlockV1b, self).__init__(**kwargs)
        with self.name_scope():
            self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1, strides=strides, use_bias=False)
            self.bn1 = contrib.nn.SyncBatchNorm(in_channels=num_channels, num_devices=num_gpus)
            self.relu = nn.Activation('relu')
            self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1, use_bias=False)
            self.bn2 = contrib.nn.SyncBatchNorm(num_devices=num_gpus)
            if use_1x1conv:
                self.downsample = nn.HybridSequential()
                self.downsample.add(nn.Conv2D(num_channels, kernel_size=1, strides=strides, use_bias=False),
                                    contrib.nn.SyncBatchNorm(num_devices=num_gpus))
            else:
                self.downsample = None

    def hybrid_forward(self, F, X):
        Y = self.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.downsample:
            X = self.downsample(X)
        return self.relu(Y + X)

def resnet_block(num_channels, num_residuals, first_block=False, num_gpus=4):
    blk = nn.HybridSequential()
    with blk.name_scope():
        for i in range(num_residuals):
            if i == 0 and not first_block:
                blk.add(BasicBlockV1b(num_channels, use_1x1conv=True, strides=2, num_gpus=num_gpus))
            else:
                blk.add(BasicBlockV1b(num_channels, num_gpus=num_gpus))
        return blk

class ConcatLayer(nn.HybridBlock):
    """Concat operation for multiple inputs."""
    def __init__(self, **kwargs):
        super(ConcatLayer, self).__init__(**kwargs)
        with self.name_scope():
            self.gap = nn.GlobalAvgPool2D()
            self.gmp = nn.GlobalMaxPool2D()
            self.flat = nn.Flatten()

    def hybrid_forward(self, F, X):
        gap = self.flat(self.gap(X))
        gmp = self.flat(self.gmp(X))
        gap_norm = F.L2Normalization(gap, mode='instance')
        gmp_norm = F.L2Normalization(gmp, mode='instance')
        return F.concat(gap_norm, gmp_norm, dim=1)

class ResNet34V1bSyncBN(nn.HybridBlock):
    def __init__(self, classes=28, num_gpus=4, **kwargs):
        super(ResNet34V1bSyncBN, self).__init__(**kwargs)
        with self.name_scope():
            self.conv1 = nn.Conv2D(64, kernel_size=7, strides=2, padding=3, use_bias=False)
            self.bn1 = contrib.nn.SyncBatchNorm(in_channels=64, num_devices=num_gpus)
            self.relu = nn.Activation('relu')
            self.maxpool = nn.MaxPool2D(pool_size=3, strides=2, padding=1)
            self.layer1 = resnet_block(64, 3, first_block=True, num_gpus=num_gpus)
            self.layer2 = resnet_block(128, 4, num_gpus=num_gpus)
            self.layer3 = resnet_block(256, 6, num_gpus=num_gpus)
            self.layer4 = resnet_block(512, 3, num_gpus=num_gpus)
            self.concat = ConcatLayer()
            self.fc = nn.Dense(classes)

    def hybrid_forward(self, F, X, masks):
        Y = self.maxpool(self.relu(self.bn1(self.conv1(X))))
        Y = self.layer4(self.layer3(self.layer2(self.layer1(Y))))
        Y = self.fc(self.concat(F.broadcast_mul(Y, masks)))
        return Y
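A note on one change I made there (my own explanation, not something you asked about): your forward used Y * masks, but the mask has a single channel while the feature map has 512, so I switched to F.broadcast_mul, which broadcasts across the channel axis explicitly and works the same whether F resolves to nd or sym. A quick shape check, assuming the 2048x2048 input from your test code:

import mxnet as mx
from mxnet import nd

# feature map after layer4 for a (4, 4, 2048, 2048) input, and the (4, 1, 64, 64) mask
Y = nd.ones((4, 512, 64, 64))
masks = nd.ones((4, 1, 64, 64))
out = nd.broadcast_mul(Y, masks)   # the mask is broadcast across the 512 channels
print(out.shape)                   # (4, 512, 64, 64)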

And you define, initialize, and run forward and backward like this:

# Create the network
net = ResNet34V1bSyncBN(num_gpus=1)
# init
ctx = mx.gpu()
y = mx.nd.ones((4,28), ctx=ctx)
data_array = mx.nd.ones((4,4,2048,2048), ctx=ctx)
mask_data_array = mx.nd.ones((4,1,64,64), ctx=ctx)

net.initialize(init.Xavier(), ctx)
net.hybridize()

loss = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
with autograd.record():
    y_hat = net(data_array.as_in_context(ctx), mask_data_array.as_in_context(ctx))
    l = loss(y_hat, y)
l.backward()
y_hat.asnumpy()[0]

Hi @ThomasDelteil,
Thanks for your kind help. Furthermore, how do I write a correct loss function? I'm running into the problem that if I replace the loss function with one I wrote myself, backward does not work properly.

The code is as below:

class CustomizedLoss(gluon.loss.Loss):
    def __init__(self, gamma=2, **kwargs):
        super(CustomizedLoss, self).__init__(None, **kwargs)
        self._gamma = gamma

    def hybrid_forward(self, F, y_hat, y):
        epsilon = 1e-12
        rhs = F.max(-y_hat).asscalar()
        max_val = F.clip(-y_hat, a_min=0.0, a_max=rhs)
        loss = y_hat - y_hat * y + max_val + F.log(F.exp(-max_val) + F.exp(-y_hat - max_val) + epsilon)
        invprobs = F.log(F.sigmoid(-y_hat * (y * 2.0 - 1.0)) + epsilon)
        loss = F.exp(invprobs * self._gamma) * loss
        return loss

Do I need to implement the backward function?

It seems to be working fine for me with your loss.

loss = CustomizedLoss(batch_axis=0)

Though I am not sure what your loss is doing, by calling .asscalar() you lose the graph dependency. I would advise staying in the F world without dropping to numpy, or you might be surprised by the results you get, especially if you hybridize your loss, since the scalar value will be turned into a constant.
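For reference, here is a sketch of how I would keep your loss entirely in the F world: the elementwise F.relu(-y_hat) plays the role of the clamp at zero, so the F.max(...).asscalar() clip is no longer needed. This is my rewrite under that assumption, so double-check the values against your original:

import mxnet as mx
from mxnet import gluon, nd

class CustomizedLoss(gluon.loss.Loss):
    def __init__(self, gamma=2, **kwargs):
        super(CustomizedLoss, self).__init__(None, **kwargs)
        self._gamma = gamma

    def hybrid_forward(self, F, y_hat, y):
        epsilon = 1e-12
        # elementwise max(-y_hat, 0): stays in the graph, no .asscalar()
        max_val = F.relu(-y_hat)
        loss = y_hat - y_hat * y + max_val + F.log(F.exp(-max_val) + F.exp(-y_hat - max_val) + epsilon)
        invprobs = F.log(F.sigmoid(-y_hat * (y * 2.0 - 1.0)) + epsilon)
        return F.exp(invprobs * self._gamma) * loss

loss_fn = CustomizedLoss(batch_axis=0)
print(loss_fn(nd.random.normal(shape=(4, 28)), nd.ones((4, 28))).shape)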

Gotcha. @ThomasDelteil, thanks for your kind help.

Hi @ThomasDelteil, I would like to ask another similar question: how do I concat pretrained weights for the first conv of ResNet50?
I tried it with ResNet34; the code is the following:

net = gluoncv.model_zoo.resnet34_v1b(pretrained=True)

w = net.conv1.weight.data()

with net.name_scope():
    net.conv1 = nn.Conv2D(in_channels=6, channels=64, kernel_size=7, strides=(2, 2), padding=(3, 3), use_bias=False)
    net.fc = nn.Dense(classes)  # classes = number of output classes for the new task

net.fc.initialize(init.Xavier(), force_reinit=True)
customized_init = mx.initializer.Constant(nd.Concat(w, w, dim=1))
net.conv1.weight.initialize(customized_init)

I want to do the same manipulation for ResNet50, but I just found that I couldn't set the value of the first conv with the following code:

net.conv1[0] = nn.Conv2D(in_channels=6, channels=64, kernel_size=7, strides=(2, 2), padding=(3, 3), use_bias=False)

The network architectures differ a little between ResNet34 and ResNet50: the first conv of ResNet50 is a HybridSequential, whereas for ResNet34 it is a plain Conv2D.

ResNet50V1dSyncBN(
  (conv1): HybridSequential(
    (0): Conv2D(3 -> 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=4, key='syncbatchnorm55_', in_channels=32)
    (2): Activation(relu)
    (3): Conv2D(32 -> 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (4): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=4, key='syncbatchnorm56_', in_channels=32)
    (5): Activation(relu)
    (6): Conv2D(32 -> 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  )
ResNet34V1bSyncBN(
  (conv1): Conv2D(None -> 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): SyncBatchNorm(eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, ndev=4, key='syncbatchnorm163_', in_channels=None)
  (relu): Activation(relu)
  (maxpool): MaxPool2D(size=(3, 3), stride=(2, 2), padding=(1, 1), ceil_mode=False)

Looking forward to your reply. Thanks a lot :slight_smile:

@chrisluu
You can replace net.conv1 with a brand new HybridSequential, add your new conv layer to it, and then copy all the subsequent layers of the original ResNet HybridSequential into it.
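A rough sketch of that (my own code, assuming a conv1 HybridSequential laid out like the one you printed, with a 3 -> 32, 3x3 conv as its first child; I use the plain gluoncv resnet50_v1d here for illustration, and the variable names are mine):

import mxnet as mx
from mxnet import nd
from mxnet.gluon import nn
import gluoncv

net = gluoncv.model_zoo.resnet50_v1d(pretrained=True)
w = net.conv1[0].weight.data()                 # pretrained kernels, shape (32, 3, 3, 3)

with net.name_scope():
    new_conv1 = nn.HybridSequential()
    first = nn.Conv2D(in_channels=6, channels=32, kernel_size=3,
                      strides=(2, 2), padding=(1, 1), use_bias=False)
    new_conv1.add(first)
    for i in range(1, len(net.conv1)):         # reuse the BN / ReLU / remaining convs as-is
        new_conv1.add(net.conv1[i])
net.conv1 = new_conv1

# duplicate the pretrained RGB kernels along dim=1 to cover the 6 input channels
first.weight.initialize(mx.initializer.Constant(nd.Concat(w, w, dim=1)))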

@ThomasDelteil, thanks for your kind replies, extremely appreciated!
