@TaoLv, thanks for your reply!
In the environment where I was getting the error reported in the first post, I have:
mxnet-mkl -> 1.6.0
gluoncv -> 0.6.0
Also, I created a new environment where I installed the nightly version as you suggested, and added gluoncv with the usual `pip install --upgrade gluoncv`. So, in this new environment I now have:
mxnet -> 2.0.0b20200513
gluoncv -> 0.7.0
but I’m still getting the same error:
File "../src/ndarray/ndarray.cc", line 840
MXNetError: Check failed: !IsMKLDNNData(): We can't generate TBlob for MKLDNN data. Please use Reorder2Default() to generate a new NDArray first
The error is returned from the line:
sum_loss, cls_loss, box_loss = mbox_loss(cls_preds, box_preds, cls_targets, box_targets)
in the code:
import mxnet as mx
from mxnet import autograd, gluon
import gluoncv as gcv
from gluoncv import data as gdata
from gluoncv.utils import download, viz
from gluoncv.data.batchify import Tuple, Stack, Pad
from gluoncv.data.transforms.presets.ssd import SSDDefaultValTransform
from gluoncv.data.transforms.presets.ssd import SSDDefaultTrainTransform
from gluoncv.utils.metrics.coco_detection import COCODetectionMetric
from gluoncv.utils.metrics.voc_detection import VOC07MApMetric
from mxboard import SummaryWriter
def _train_one_epoch(net, train_data, ctx, trainer, mbox_loss,
                     ce_metric, smoothl1_metric, epoch):
    """Run one pass over ``train_data`` and update ``net`` through ``trainer``.

    For every batch the inputs and targets are split across ``ctx``, the
    forward pass and the multibox loss are recorded, gradients are
    back-propagated and one optimizer step is taken. Both loss metrics are
    updated per batch and a progress line is printed every 20 batches.
    """
    import time  # local import: the module-level import block has no ``time``

    btic = time.time()
    for i, batch in enumerate(train_data):
        batch_size = batch[0].shape[0]
        data = gluon.utils.split_and_load(batch[0], ctx_list=ctx, batch_axis=0)
        cls_targets = gluon.utils.split_and_load(batch[1], ctx_list=ctx, batch_axis=0)
        box_targets = gluon.utils.split_and_load(batch[2], ctx_list=ctx, batch_axis=0)
        with autograd.record():
            cls_preds = []
            box_preds = []
            for x in data:
                cls_pred, box_pred, _ = net(x)
                cls_preds.append(cls_pred)
                box_preds.append(box_pred)
            sum_loss, cls_loss, box_loss = mbox_loss(cls_preds, box_preds, cls_targets, box_targets)
            autograd.backward(sum_loss)
        # since we have already normalized the loss, we don't want to normalize by batch-size anymore
        trainer.step(1)
        ce_metric.update(0, [l * batch_size for l in cls_loss])
        smoothl1_metric.update(0, [l * batch_size for l in box_loss])
        if i % 20 == 0:
            name1, loss1 = ce_metric.get()
            name2, loss2 = smoothl1_metric.get()
            print('[Epoch {}][Batch {}], Speed: {:.3f} samples/sec, {}={:.3f}, {}={:.3f}'.format(
                epoch, i, batch_size / (time.time() - btic), name1, loss1, name2, loss2))
        # restart the timer so the reported speed covers a single batch
        btic = time.time()


def main():
    """Fine-tune a pretrained SSD model on a two-class record-file dataset on CPU.

    Loads ``ssd_512_resnet50_v1_coco``, resets its classes to ``Guitar`` and
    ``human_face``, trains for 100 epochs with SGD, validates after every
    epoch (scalars go to the mxboard summary writer and to a text log), and
    exports the network every 5 epochs under ``saved_weights/random_test/``.
    """
    import os  # local import: the module-level import block has no ``os``

    ctx = [mx.cpu()]
    model_name = "ssd_512_resnet50_v1_coco"
    classes = ['Guitar', 'human_face']
    batch_size = 8
    net = gcv.model_zoo.get_model(model_name, pretrained=True)
    net.reset_class(classes)
    saved_weights_path = "saved_weights/random_test/"
    # make sure the output directory exists before anything is written into it
    os.makedirs(saved_weights_path, exist_ok=True)
    # prepare data
    data_shape = 512
    train_dataset = gcv.data.RecordFileDetection('custom_dataset/train_guitar_face_00.rec', coord_normalized=True)
    val_dataset = gcv.data.RecordFileDetection('custom_dataset/test_guitar_face_00.rec', coord_normalized=True)
    # summary file for tensorboard
    sw = SummaryWriter(logdir=saved_weights_path+'logs/', flush_secs=30)
    try:
        eval_metric = VOC07MApMetric(iou_thresh=0.5, class_names=classes)
        # create data batches from dataset (net, train_dataset, data_shape, batch_size, num_workers):
        train_data, val_data = get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_workers=0)
        # ---------------------
        # Training
        # ---------------------
        # configuration
        net.collect_params().reset_ctx(ctx)
        trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.001, 'wd': 0.0005, 'momentum': 0.9})
        mbox_loss = gcv.loss.SSDMultiBoxLoss()
        ce_metric = mx.metric.Loss('CrossEntropy')
        smoothl1_metric = mx.metric.Loss('SmoothL1')
        # NOTE(review): the original snippet wrote to an undefined ``log_file``;
        # the file name and write mode below are assumptions - adjust as needed.
        log_path = os.path.join(saved_weights_path, 'val_log.txt')
        with open(log_path, 'w') as log_file:
            for epoch in range(0, 100):
                ce_metric.reset()
                smoothl1_metric.reset()
                net.hybridize(static_alloc=True, static_shape=True)
                _train_one_epoch(net, train_data, ctx, trainer, mbox_loss,
                                 ce_metric, smoothl1_metric, epoch)
                # NOTE(review): the original snippet used ``map_name``/``mean_ap``
                # without assigning them. This assumes validate() returns a
                # (names, values) pair of equal-length sequences - confirm
                # against the elided validate() body.
                map_name, mean_ap = validate(net, val_data, ctx, eval_metric)
                print(f"\nValidation accuracies for epoch: {epoch}")
                for k, v in zip(map_name, mean_ap):
                    sw.add_scalar(tag=f'{k}_val_acc', value=v, global_step=epoch)
                    print(f"\t{k}={v}")
                    log_file.write(f"{v:15.5}")
                log_file.write("\n")
                # save model parameters for inference every 5 epochs
                if epoch % 5 == 0:
                    net.export(f"{saved_weights_path}", epoch=epoch)
    finally:
        # close the summary writer even when training aborts with an error
        sw.close()
def get_dataloader(net, train_dataset, val_dataset, data_shape, batch_size, num_workers):
[...]
def validate(net, val_data, ctx, eval_metric):
[...]
# Run the training entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()