Hi, I’d like to showcase benefit of mixed precision training. I have this simple net:

```
def BuildNet():
net = gluon.nn.HybridSequential()
with net.name_scope():
net.add(gluon.nn.Conv2D(channels=20, kernel_size=3, activation='relu'))
net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
net.add(gluon.nn.Conv2D(channels=50, kernel_size=3, activation='relu'))
net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2))
# The Flatten layer collapses all axis, except the first one, into one axis.
net.add(gluon.nn.Flatten())
net.add(gluon.nn.Dense(num_fc, activation="relu"))
net.add(gluon.nn.Dropout(.3))
net.add(gluon.nn.Dense(num_outputs))
return net
```

**This runs fine:**

```
net = BuildNet()
# Parameter initialization
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
# Softmax cross-entropy loss
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
# Optimizer
trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': .1})
# Training loop
epochs = 3
smoothing_constant = .01
curr_loss = mx.nd.zeros((1,), ctx=ctx)
for e in range(epochs):
tick = time.time()
for i, (data, label) in enumerate(train_data):
data = data.as_in_context(ctx)
label = label.as_in_context(ctx)
with autograd.record():
output = net(data)
loss = softmax_cross_entropy(output, label)
loss.backward()
trainer.step(data.shape[0])
##########################
# Keep a moving average of the losses
##########################
curr_loss += nd.mean(loss)
test_accuracy = evaluate_accuracy(test_data, net)
train_accuracy = evaluate_accuracy(train_data, net)
print("Epoch {}. Loss: {}, Train_acc {}, Test_acc {}, {:.4f}"
.format(e, curr_loss.asscalar()/len(train_data), train_accuracy, test_accuracy, time.time()-tick))
```

**This errors:**

Only changes are casting the net and the data to float16.

```
net = BuildNet()
# Parameter initialization
net.collect_params().initialize(mx.init.Xavier(magnitude=2.24), ctx=ctx)
net.cast('float16')
# Softmax cross-entropy loss
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
# Optimizer
trainer = gluon.Trainer(
params=net.collect_params(),
optimizer='sgd',
optimizer_params={'learning_rate': .1,
'multi_precision': True})
# Training loop
epochs = 3
smoothing_constant = .01
curr_loss = mx.nd.zeros((1,), ctx=ctx)
for e in range(epochs):
tick = time.time()
for i, (data, label) in enumerate(train_data):
data = data.as_in_context(ctx).astype('float16')
label = label.as_in_context(ctx).astype('float16')
with autograd.record():
output = net(data)
loss = softmax_cross_entropy(output, label)
loss.backward()
trainer.step(data.shape[0])
##########################
# Keep a moving average of the losses
##########################
curr_loss += nd.mean(loss)
test_accuracy = evaluate_accuracy(test_data, net)
train_accuracy = evaluate_accuracy(train_data, net)
print("Epoch {}. Loss: {}, Train_acc {}, Test_acc {}, {:.4f}"
.format(e, curr_loss.asscalar()/len(train_data), train_accuracy, test_accuracy, time.time()-tick))
```

I followed this https://mxnet.incubator.apache.org/faq/float16.html quite carefully.

**What is wrong?**

Error is

`MXNetError: [18:50:32] src/operator/contrib/../elemwise_op_common.h:133: Check failed: assign(&dattr, (*vec)[i]) Incompatible attr in node at 1-th input: expected float32, got float16`