About multi-label training in MXNet: crossentropy.py or LogisticRegressionOutput? A strange result!

I use crossentropy.py from https://github.com/miraclewkf/multilabel-MXNet.git to train a multi-label classifier (1000 classes), but the inference results are always strange. The same cross-entropy loss is also located in example/recommenders in the MXNet source.

(1) The result scores are very low, and only a small number of the outputs are smaller than 0.5.
(2) When I change actually_calculate_loss to True, the training loss is NaN.

Below is the custom accuracy (acc) and loss metric code:

def acc(label, pred, label_width=num_class):
    return float((label == np.round(pred)).sum()) / label_width / pred.shape[0]

def loss(label, pred):
    loss_all = 0
    for i in range(len(pred)):
        loss = 0
        loss -= label[i] * np.log(pred[i] + 1e-6) + (1. - label[i]) * np.log(1. + 1e-6 - pred[i])
        loss_all += np.sum(loss)
    loss_all = float(loss_all) / float(len(pred) + 0.000001)
    return loss_all
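
To make the metric semantics concrete, here is a small self-contained check of these two functions on a toy batch (my own sketch; num_class = 3 is a made-up value, the real run uses 1000):

import numpy as np

num_class = 3  # toy label_width, only for this sketch

def acc(label, pred, label_width=num_class):
    # fraction of label positions where the rounded prediction matches the label
    return float((label == np.round(pred)).sum()) / label_width / pred.shape[0]

def loss(label, pred):
    # per-sample sum of element-wise binary cross-entropy, averaged over the batch
    loss_all = 0
    for i in range(len(pred)):
        l = -(label[i] * np.log(pred[i] + 1e-6) + (1. - label[i]) * np.log(1. + 1e-6 - pred[i]))
        loss_all += np.sum(l)
    return float(loss_all) / float(len(pred) + 0.000001)

label = np.array([[1., 0., 1.],
                  [0., 1., 0.]])
pred = np.array([[0.9, 0.2, 0.6],
                 [0.1, 0.7, 0.3]])
print(acc(label, pred))   # 1.0: every rounded prediction matches its label
print(loss(label, pred))  # ~0.83: mean over the two samples of the summed cross-entropy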

And below is the loss layer implementation:

eps = 1e-6  # Avoid -inf when taking log(0)
eps1 = 1. + eps
eps_1 = 1. - eps

def forward(self, is_train, req, in_data, out_data, aux):
    # Shapes:
    #  b = minibatch size
    #  d = number of dimensions
    actually_calculate_loss = False
    if actually_calculate_loss:
        p = in_data[0].asnumpy()  # shape=(b,d)
        y = in_data[1].asnumpy()
        out = y * np.log(p + self.eps) + (1. - y) * np.log((self.eps1) - p)
        self.assign(out_data[0], req[0], mx.nd.array(out))
    else:
        # Just copy the predictions forward
        self.assign(out_data[0], req[0], in_data[0])

def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    self.approx_backward(req, out_grad, in_data, out_data, in_grad, aux)
    # self.exact_backward(req, out_grad, in_data, out_data, in_grad, aux)

def approx_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """Correct grad = (y-p)/(p-p^2)
    But if y is just 1 or 0, then this simplifies to
        grad = 1/(p-1+y)
    which is more numerically stable
    """
    p = in_data[0].asnumpy()  # shape=(b,d)
    y = in_data[1].asnumpy()
    grad = -1. / (p - self.eps_1 + y)
    self.assign(in_grad[0], req[0], mx.nd.array(grad))

def exact_backward(self, req, out_grad, in_data, out_data, in_grad, aux):
    """grad = (y-p)/(p-p^2)"""
    p = in_data[0].asnumpy()  # shape=(b,d)
    y = in_data[1].asnumpy()
    grad = (p - y) / ((p + self.eps) * (self.eps1 - p))
    self.assign(in_grad[0], req[0], mx.nd.array(grad))
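
A quick numerical check of the docstring's claim (this snippet is my addition, not part of crossentropy.py): for labels that are exactly 0 or 1, the exact gradient (y-p)/(p-p^2) reduces to 1/(p-1+y), so approx_backward and exact_backward should produce (almost) the same values:

import numpy as np

eps = 1e-6
eps1 = 1. + eps
eps_1 = 1. - eps

p = np.array([0.1, 0.5, 0.9, 0.3])  # sigmoid outputs
y = np.array([0.,  1.,  1.,  0.])   # binary labels

approx = -1. / (p - eps_1 + y)               # formula used by approx_backward
exact = (p - y) / ((p + eps) * (eps1 - p))   # formula used by exact_backward
print(np.max(np.abs(approx - exact)))        # tiny difference, caused only by the eps terms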

And below is the training code:

def get_fine_tune_model(sym, arg_params, num_classes, layer_name):

    all_layers = sym.get_internals()
    net = all_layers[layer_name + '_output']
    net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes, name='fc1')
    net = mx.symbol.sigmoid(data=net, name='sig')
    net = mx.symbol.Custom(data=net, name='softmax', op_type='CrossEntropyLoss')
    # net = mx.symbol.LogisticRegressionOutput(data=net, name='softmax')

    # new_args = dict({k: arg_params[k] for k in arg_params if 'fc' not in k})
    # return (net, new_args)
    return (net, arg_params)
    # return net

def multi_factor_scheduler(begin_epoch, epoch_size, step=[5, 10], factor=0.1):
    step_ = [epoch_size * (x - begin_epoch) for x in step if x - begin_epoch > 0]
    return mx.lr_scheduler.MultiFactorScheduler(step=step_, factor=factor) if len(step_) else None
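
To make explicit what this helper builds (toy numbers below, epoch_size is made up), it just converts epoch boundaries into update counts, and MultiFactorScheduler then multiplies the learning rate by factor at each of those update counts:

# Toy example of the step conversion done by multi_factor_scheduler:
begin_epoch, epoch_size = 0, 1000          # e.g. 1000 batches per epoch
step = [60, 120, 180]                      # epochs at which to decay the learning rate
step_ = [epoch_size * (x - begin_epoch) for x in step if x - begin_epoch > 0]
print(step_)  # [60000, 120000, 180000] -> lr is multiplied by 0.1 at these update counts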

train = mx.io.ImageRecordIter(
    path_imgrec        = train_data,
    label_width        = num_class,
    data_shape         = (3, 224, 224),
    batch_size         = args.batch_size,
    rand_crop          = True,
    preprocess_threads = 20,
    rand_mirror        = True,
    shuffle            = True,
    num_parts          = kv.num_workers,
    part_index         = kv.rank)

val = mx.io.ImageRecordIter(
    path_imgrec = val_data,
    label_width = num_class,
    batch_size  = args.batch_size,
    data_shape  = (3, 224, 224),
    rand_crop   = False,
    rand_mirror = False,
    num_parts   = kv.num_workers,
    part_index  = kv.rank)
kv = mx.kvstore.create(args.kv_store)

prefix = model
sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)

(new_sym, new_args) = get_fine_tune_model(
    sym, arg_params, args.num_classes, 'flatten0')

epoch_size = max(int(args.num_examples / args.batch_size / kv.num_workers), 1)
lr_scheduler = multi_factor_scheduler(args.epoch, epoch_size, step=[60, 120, 180])

optimizer_params = {
    'learning_rate': args.lr,
    'momentum': args.mom,
    'wd': args.wd,
    'lr_scheduler': lr_scheduler}
initializer = mx.init.Xavier(
    rnd_type='gaussian', factor_type="in", magnitude=2)

if gpus == '':
    devs = mx.cpu()
else:
    devs = [mx.gpu(int(i)) for i in gpus.split(',')]

model = mx.mod.Module(
    context=devs,
    symbol=new_sym,
)

checkpoint = mx.callback.do_checkpoint(args.save_result + args.save_name)

def acc(label, pred, label_width=num_class):
    return float((label == np.round(pred)).sum()) / label_width / pred.shape[0]

def loss(label, pred):
    loss_all = 0
    for i in range(len(pred)):
        loss = 0
        loss -= label[i] * np.log(pred[i] + 1e-6) + (1. - label[i]) * np.log(1. + 1e-6 - pred[i])
        loss_all += np.sum(loss)
    loss_all = float(loss_all) / float(len(pred) + 0.000001)
    return loss_all

eval_metric = list()
eval_metric.append(mx.metric.np(acc))
eval_metric.append(mx.metric.np(loss))

model.fit(train,
          begin_epoch=epoch,
          num_epoch=num_epoch,
          eval_data=val,
          eval_metric=eval_metric,
          validation_metric=eval_metric,
          kvstore=kv,
          optimizer='sgd',
          optimizer_params=optimizer_params,
          arg_params=new_args,
          aux_params=aux_params,
          initializer=initializer,
          allow_missing=True,
          batch_end_callback=mx.callback.Speedometer(args.batch_size, 20),
          epoch_end_callback=checkpoint)
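
For reference on the data format: with label_width = num_class, ImageRecordIter reads num_class label values per record, so each .lst line fed to im2rec carries a tab-separated multi-hot label vector between the index and the image path. A toy sketch of writing one such line (made-up values, not my real data):

# Toy example of one .lst line for multi-label ImageRecordIter (label_width = 5 here):
num_class = 5
multi_hot = [0., 1., 0., 0., 1.]  # this image belongs to classes 1 and 4
line = "\t".join(["0"] + ["%f" % v for v in multi_hot] + ["images/example_0.jpg"]) + "\n"
print(line)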

Is there an error somewhere? Thank you very much for any pointers!

(3) When I use LogisticRegressionOutput, the inference scores are lower.

I am not sure where the error is, but I wonder whether anything stops you from using SigmoidBinaryCrossEntropyLoss. It looks to me like the code in crossentropy.py, in combination with the model, tries to do something similar, but it uses numpy, which means it won't be able to run on a GPU.

The cross-entropy function in the examples should work faster, as it doesn't use numpy in the operator itself.
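
Roughly what I have in mind is the sketch below (untested, and using a dummy Dense head rather than your fine-tuned symbol): the Gluon SigmoidBinaryCrossEntropyLoss takes the raw logits and applies the sigmoid internally when from_sigmoid=False, which avoids the explicit sigmoid plus a hand-written log:

import numpy as np
import mxnet as mx
from mxnet import gluon, autograd

# Dummy multi-label head: one raw logit per class (stand-in for the real backbone + fc1).
net = gluon.nn.Dense(1000)
net.initialize(mx.init.Xavier())

# from_sigmoid=False: the loss applies the sigmoid to the logits itself,
# which is more numerically stable than sigmoid followed by a manual log().
loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=False)

features = mx.nd.random.uniform(shape=(2, 2048))  # stand-in for backbone features

label_np = np.zeros((2, 1000), dtype='float32')   # multi-hot targets
label_np[0, 3] = 1.0                              # sample 0 has class 3
label_np[1, [10, 42]] = 1.0                       # sample 1 has classes 10 and 42
label = mx.nd.array(label_np)

with autograd.record():
    loss = loss_fn(net(features), label)
loss.backward()
print(loss)  # one loss value per sample in the batch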

Which dataset are you using?

Thank you very much. I am using the Open Images dataset.