Architecture with 2 branches

Hi, I am new to MXNet, and for my thesis I want to replicate the following architecture.

Notice the two branches. Depending on the value of t, only one branch is fitted. So if, for a sample x, t equals 1, the h1 branch should be trained (and analogously for t = 0).

I have tried the following:

    import mxnet as mx
    from mxnet import gluon

    # wrapper function only so the snippet is self-contained; the name is illustrative,
    # rep_hidden_size / hyp_hidden_size are the layer widths
    def build_two_branch_net(rep_hidden_size, hyp_hidden_size):
        # Representation Layers
        data = mx.sym.Variable('data', dtype='float32')
        rep_fc1 = mx.sym.FullyConnected(data=data, name='rep_fc1', num_hidden=rep_hidden_size)
        rep_relu1 = mx.sym.Activation(data=rep_fc1, name='rep_relu1', act_type="relu")
        rep_fc2 = mx.sym.FullyConnected(data=rep_relu1, name='rep_fc2', num_hidden=rep_hidden_size)
        rep_relu2 = mx.sym.Activation(data=rep_fc2, name='rep_relu2', act_type="relu")
        rep_fc3 = mx.sym.FullyConnected(data=rep_relu2, name='rep_fc3', num_hidden=rep_hidden_size)
        rep_relu3 = mx.sym.Activation(data=rep_fc3, name='rep_relu3', act_type="relu")

        # Hypothesis Layers for t = 1
        t1_hyp_fc1 = mx.sym.FullyConnected(data=rep_relu3, name='t1_hyp_fc1', num_hidden=hyp_hidden_size)
        t1_hyp_relu1 = mx.sym.Activation(data=t1_hyp_fc1, name='t1_hyp_relu1', act_type="relu")
        t1_hyp_fc2 = mx.sym.FullyConnected(data=t1_hyp_relu1, name='t1_hyp_fc2', num_hidden=hyp_hidden_size)
        t1_hyp_relu2 = mx.sym.Activation(data=t1_hyp_fc2, name='t1_hyp_relu2', act_type="relu")
        t1_hyp_fc3 = mx.sym.FullyConnected(data=t1_hyp_relu2, name='t1_hyp_fc3', num_hidden=hyp_hidden_size)
        t1_hyp_relu3 = mx.sym.Activation(data=t1_hyp_fc3, name='t1_hyp_relu3', act_type="relu")
        t1_hyp_fc4 = mx.sym.FullyConnected(data=t1_hyp_relu3, name='t1_hyp_fc4', num_hidden=1)

        # Hypothesis Layers for t = 0
        t0_hyp_fc1 = mx.sym.FullyConnected(data=rep_relu3, name='t0_hyp_fc1', num_hidden=hyp_hidden_size)
        t0_hyp_relu1 = mx.sym.Activation(data=t0_hyp_fc1, name='t0_hyp_relu1', act_type="relu")
        t0_hyp_fc2 = mx.sym.FullyConnected(data=t0_hyp_relu1, name='t0_hyp_fc2', num_hidden=hyp_hidden_size)
        t0_hyp_relu2 = mx.sym.Activation(data=t0_hyp_fc2, name='t0_hyp_relu2', act_type="relu")
        t0_hyp_fc3 = mx.sym.FullyConnected(data=t0_hyp_relu2, name='t0_hyp_fc3', num_hidden=hyp_hidden_size)
        t0_hyp_relu3 = mx.sym.Activation(data=t0_hyp_fc3, name='t0_hyp_relu3', act_type="relu")
        t0_hyp_fc4 = mx.sym.FullyConnected(data=t0_hyp_relu3, name='t0_hyp_fc4', num_hidden=1)

        # bundle the two branch outputs and the shared representation into one block
        rep_net = gluon.SymbolBlock(outputs=[t1_hyp_fc4, t0_hyp_fc4, rep_relu3], inputs=[data])

        return rep_net

This works "fine": I compute the combined losses for t = 1 and t = 0 and call backward on the result. But this still runs both branches for every sample.
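
For reference, this is roughly what my training step looks like right now (an untested sketch; `rep_net` is the SymbolBlock from above, and `x`, `y`, `t` are placeholder names for the feature, outcome and treatment batches):

    import mxnet as mx
    from mxnet import autograd, gluon

    # assumes rep_net's parameters have already been initialized,
    # e.g. via rep_net.collect_params().initialize(...)
    loss_fn = gluon.loss.L2Loss()

    with autograd.record():
        # both branches are computed for every sample
        y1_hat, y0_hat, rep = rep_net(x)
        # t is a 0/1 NDArray of shape (batch,); mask each head's loss and combine
        loss = loss_fn(y1_hat, y) * t + loss_fn(y0_hat, y) * (1 - t)
        loss = loss.mean()
    loss.backward()
    # trainer.step(batch_size) follows here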

Would you know of a way in which only one branch is trained, depending on the value of "t"?

Thank you.

There are two things that come to mind (a rough sketch of both is below):

  1. The BlockGrad symbol. I guess that if you add it to the end of the t1 branch, the whole branch will stop training, because BlockGrad passes the value forward but stops the gradient from flowing back through it. If you plug it in earlier, only the layers after it (closer to the loss) will keep training.
  2. The optimizer's set_lr_mult, which lets you give individual parameters a learning-rate multiplier (e.g. 0 to freeze a branch).

I have tried neither of these, sorry, so I can't tell you more right now. Sorry if I misunderstood the question, too.
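
An untested sketch of both ideas, reusing the layer names from your snippet (so treat the exact parameter names as assumptions):

    import mxnet as mx

    # 1) BlockGrad: the value passes through on the forward pass, but no gradient
    #    flows back into whatever feeds this symbol, so the t0 branch stays frozen.
    t0_frozen = mx.sym.BlockGrad(t0_hyp_fc4, name='t0_frozen')

    # 2) lr_mult: keep the gradients, but give the branch's parameters a
    #    learning-rate multiplier of 0 so their updates become no-ops.
    optimizer = mx.optimizer.SGD(learning_rate=0.01)
    optimizer.set_lr_mult({'t0_hyp_fc1_weight': 0.0, 't0_hyp_fc1_bias': 0.0})
    # ...and similarly for the remaining t0_hyp_* parameters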

I figured out that the following is possible, but I am not sure whether the gradients are correctly backpropagated from each branch.

    import numpy as np
    import mxnet as mx
    from mxnet.gluon import nn


    class CFRNet(nn.HybridBlock):
        def __init__(self, rep_hidden_size, hyp_hidden_size, weight_init_scale, dim_input, **kwargs):
            nn.HybridBlock.__init__(self, **kwargs)

            self.input_shape = None

            with self.name_scope():
                # Representation Layers
                self.rep_fc1 = nn.Dense(rep_hidden_size,
                                        activation='relu',
                                        weight_initializer=mx.init.Normal(
                                            sigma=weight_init_scale / np.sqrt(dim_input)))
                self.rep_fc2 = nn.Dense(rep_hidden_size,
                                        activation='relu',
                                        weight_initializer=mx.init.Normal(
                                            sigma=weight_init_scale / np.sqrt(rep_hidden_size)))
                self.rep_fc3 = nn.Dense(rep_hidden_size,
                                        activation='relu',
                                        weight_initializer=mx.init.Normal(
                                            sigma=weight_init_scale / np.sqrt(rep_hidden_size)))

                # Hypothesis Layers for t = 1
                self.t1_hyp_fc1 = nn.Dense(hyp_hidden_size,
                                           activation='relu',
                                           weight_initializer=mx.init.Normal(
                                               sigma=weight_init_scale / np.sqrt(rep_hidden_size)))
                self.t1_hyp_fc2 = nn.Dense(hyp_hidden_size,
                                           activation='relu',
                                           weight_initializer=mx.init.Normal(
                                               sigma=weight_init_scale / np.sqrt(hyp_hidden_size)))
                self.t1_hyp_fc3 = nn.Dense(hyp_hidden_size,
                                           activation='relu',
                                           weight_initializer=mx.init.Normal(
                                               sigma=weight_init_scale / np.sqrt(hyp_hidden_size)))
                self.t1_hyp_fc4 = nn.Dense(1)

                # Hypothesis Layers for t = 0
                self.t0_hyp_fc1 = nn.Dense(hyp_hidden_size,
                                           activation='relu',
                                           weight_initializer=mx.init.Normal(
                                               sigma=weight_init_scale / np.sqrt(rep_hidden_size)))
                self.t0_hyp_fc2 = nn.Dense(hyp_hidden_size,
                                           activation='relu',
                                           weight_initializer=mx.init.Normal(
                                               sigma=weight_init_scale / np.sqrt(hyp_hidden_size)))
                self.t0_hyp_fc3 = nn.Dense(hyp_hidden_size,
                                           activation='relu',
                                           weight_initializer=mx.init.Normal(
                                               sigma=weight_init_scale / np.sqrt(hyp_hidden_size)))
                self.t0_hyp_fc4 = nn.Dense(1)

        def forward(self, x, t):
            # remember the input shape, then defer to the regular HybridBlock forward
            self.input_shape = x.shape

            return nn.HybridBlock.forward(self, x, t)

        def hybrid_forward(self, F, x, t):
            rep_relu1 = self.rep_fc1(x)
            rep_relu2 = self.rep_fc2(rep_relu1)
            rep_relu3 = self.rep_fc3(rep_relu2)

            # t is expected to be a NumPy 0/1 vector here, so np.where can be used
            # to route each sample into the branch that matches its treatment
            t1_hyp_relu1 = self.t1_hyp_fc1(rep_relu3[np.where(t == 1)[0]])
            t1_hyp_relu2 = self.t1_hyp_fc2(t1_hyp_relu1)
            t1_hyp_relu3 = self.t1_hyp_fc3(t1_hyp_relu2)
            t1_hyp_relu4 = self.t1_hyp_fc4(t1_hyp_relu3)

            t0_hyp_relu1 = self.t0_hyp_fc1(rep_relu3[np.where(t == 0)[0]])
            t0_hyp_relu2 = self.t0_hyp_fc2(t0_hyp_relu1)
            t0_hyp_relu3 = self.t0_hyp_fc3(t0_hyp_relu2)
            t0_hyp_relu4 = self.t0_hyp_fc4(t0_hyp_relu3)

            return t1_hyp_relu4, t0_hyp_relu4, rep_relu3
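
To check whether the gradients really end up only in the branch that received samples, I run something like the following (an untested sketch with dummy data; the hyper-parameter values are placeholders):

    import numpy as np
    import mxnet as mx
    from mxnet import autograd, gluon

    net = CFRNet(rep_hidden_size=200, hyp_hidden_size=100,
                 weight_init_scale=0.1, dim_input=25)
    net.initialize()
    loss_fn = gluon.loss.L2Loss()

    # dummy batch: 32 samples, 25 features, binary treatment t as a NumPy vector
    x = mx.nd.random.normal(shape=(32, 25))
    y = mx.nd.random.normal(shape=(32, 1))
    t = np.random.randint(0, 2, size=32)
    idx1, idx0 = np.where(t == 1)[0], np.where(t == 0)[0]

    with autograd.record():
        y1_hat, y0_hat, rep = net(x, t)
        # each head is only compared against the outcomes of its own samples
        loss = loss_fn(y1_hat, y[idx1]).sum() + loss_fn(y0_hat, y[idx0]).sum()
    loss.backward()

    # each head should only have received gradients from its own subset of the batch
    print(net.t1_hyp_fc1.weight.grad().abs().sum(),
          net.t0_hyp_fc1.weight.grad().abs().sum())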

What about an arbitrary number of Representation and Hypothesis layers?
They are clearly hard-coded to three here.
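
Maybe something like this could replace the hard-coded layers, building each stack in a loop with nn.HybridSequential (untested; `make_stack` and the `num_*_layers` arguments are just illustrative names):

    import numpy as np
    import mxnet as mx
    from mxnet.gluon import nn

    def make_stack(num_layers, hidden_size, in_size, weight_init_scale):
        # stack of Dense + ReLU layers; only the first layer's fan-in differs
        stack = nn.HybridSequential()
        with stack.name_scope():
            for i in range(num_layers):
                fan_in = in_size if i == 0 else hidden_size
                stack.add(nn.Dense(hidden_size, activation='relu',
                                   weight_initializer=mx.init.Normal(
                                       sigma=weight_init_scale / np.sqrt(fan_in))))
        return stack

    # in CFRNet.__init__ the three hard-coded layers per stack would then become:
    #     self.rep = make_stack(num_rep_layers, rep_hidden_size, dim_input, weight_init_scale)
    #     self.t1_hyp = make_stack(num_hyp_layers, hyp_hidden_size, rep_hidden_size, weight_init_scale)
    #     self.t1_out = nn.Dense(1)
    # (and the same for the t0 branch), with hybrid_forward calling the stacks directly.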

Would anybody know how to hybridize it?
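
The only idea I have so far is to drop the NumPy indexing, run both heads on the full batch, and pick the relevant output per row with F.where, so that hybrid_forward only uses F operators. Since the gradient of F.where is zero for the branch that was not selected, each sample should still only train its own head, at the cost of wasted forward compute and a changed return signature (untested sketch):

    def hybrid_forward(self, F, x, t):
        rep = self.rep_fc3(self.rep_fc2(self.rep_fc1(x)))

        # both heads run on every sample (some wasted compute)...
        y1 = self.t1_hyp_fc4(self.t1_hyp_fc3(self.t1_hyp_fc2(self.t1_hyp_fc1(rep))))
        y0 = self.t0_hyp_fc4(self.t0_hyp_fc3(self.t0_hyp_fc2(self.t0_hyp_fc1(rep))))

        # ...but F.where routes the gradient: rows with t == 1 only backpropagate
        # into the t1 head, rows with t == 0 only into the t0 head.
        # Here t is an NDArray/Symbol of 0/1 values with the same shape as y1, i.e. (batch, 1).
        y_hat = F.where(t, y1, y0)
        return y_hat, rep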