This exception was thrown while running my code with my fine-tuned pretrained model: `raise MXNetError(py_str(_LIB.MXGetLastError()))` → `mxnet.base.MXNetError`


#1

Here’s the full traceback:
Traceback (most recent call last):
File “orahi_eval.py”, line 22, in
embedding = Embedding(‘orahiimages’,0, 0)
File “…/recognition/embedding.py”, line 24, in init
model.set_params(arg_params, aux_params)
File “/home/anshuman/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py”, line 350, in set_params
allow_extra=allow_extra)
File “/home/anshuman/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py”, line 309, in init_params
_impl(desc, arr, arg_params)
File “/home/anshuman/anaconda3/lib/python3.6/site-packages/mxnet/module/module.py”, line 297, in _impl
cache_arr.copyto(arr)
File “/home/anshuman/anaconda3/lib/python3.6/site-packages/mxnet/ndarray/ndarray.py”, line 1970, in copyto
return _internal._copyto(self, out=other)
File “”, line 25, in _copyto
File “/home/anshuman/anaconda3/lib/python3.6/site-packages/mxnet/_ctypes/ndarray.py”, line 92, in _imperative_invoke
ctypes.byref(out_stypes)))
File “/home/anshuman/anaconda3/lib/python3.6/site-packages/mxnet/base.py”, line 149, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [12:56:33] src/operator/nn/…/tensor/…/elemwise_op_common.h:123: Check failed: assign(&dattr, (*vec)[i]) Incompatible attr in node at 0-th output: expected [1], got [20]

Here’s my code for fine tuning:
import sys
# Make the insightface deploy directory importable so `import face_model` resolves.
# NOTE: the original paste used curly quotes ('…') here, which is a SyntaxError
# in Python; they have been replaced with plain ASCII quotes.
sys.path.insert(0, '/home/anshuman/Documents/Repos/insightface/deploy')
import face_model
import argparse
import cv2
import numpy as np
import logging
import mxnet as mx

# Timestamped log lines for the fine-tuning run.
head = '%(asctime)-15s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=head)

def get_iterators(batch_size, data_shape=(3, 112, 112)):
    """Build the training and validation RecordIO image iterators.

    batch_size: number of images per batch.
    data_shape: (channels, height, width) of each decoded image.
    Returns a (train_iter, val_iter) tuple.
    """
    def _record_iter(rec_path):
        # Both splits use identical iterator settings; only the .rec file differs.
        return mx.io.ImageRecordIter(
            path_imgrec=rec_path,
            data_name='data',
            label_name='softmax_label',
            batch_size=batch_size,
            data_shape=data_shape,
        )

    train_iter = _record_iter('../datasets/img/saved_faces/orahiimagedb_train.rec')
    val_iter = _record_iter('../datasets/img/saved_faces/orahiimagedb_val.rec')
    return (train_iter, val_iter)
def get_model(ctx, image_size, model_str, layer):
    """Load a pretrained checkpoint and return it truncated at `layer`.

    model_str: '<checkpoint-prefix>,<epoch>' — the two pieces are split on the
        comma and fed to mx.model.load_checkpoint.
    layer: name of the internal layer whose output becomes the new graph head.
    Returns (bound_module, truncated_symbol, arg_params, aux_params).
    """
    parts = model_str.split(',')
    assert len(parts) == 2
    prefix = parts[0]
    epoch = int(parts[1])
    print('Loading the Model... Please Wait \n', prefix, epoch)
    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
    # Cut the symbol graph at the requested internal output.
    sym = sym.get_internals()[layer + '_output']
    module = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
    # Bind for single-image inference; no label shapes since label_names is None.
    module.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))])
    module.set_params(arg_params, aux_params)
    return module, sym, arg_params, aux_params
def fit(symbol, arg_params, aux_params, train, val, batch_size, num_gpus):
    """Fine-tune `symbol` on the given iterators and score it on `val`.

    symbol: network symbol to train (softmax head already attached).
    arg_params / aux_params: pretrained parameters; missing ones (the new
        classifier head) are initialised via the Xavier initializer because
        allow_missing=True.
    train / val: data iterators from get_iterators().
    batch_size / num_gpus: used for the Speedometer callback and device list.
    Returns the result of mod.score(val, Accuracy()).
    """
    # Fall back to CPU when no GPUs are requested; range(0) would otherwise
    # yield an empty context list and the Module would be unusable.
    devs = [mx.gpu(i) for i in range(num_gpus)] if num_gpus > 0 else [mx.cpu()]
    mod = mx.mod.Module(symbol=symbol, context=devs)
    mod.fit(train, val,
        num_epoch=20,
        arg_params=arg_params,
        aux_params=aux_params,
        allow_missing=True,
        batch_end_callback = mx.callback.Speedometer(batch_size, 10),
        kvstore='device',
        optimizer='sgd',
        optimizer_params={'learning_rate':0.01},
        initializer=mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2),
        eval_metric='acc')
    metric = mx.metric.Accuracy()
    return mod.score(val, metric)
def get_fine_tune_model(symbol, arg_params, num_classes, layer_name='dropout0'):#need to figure out the last layer name
    """
    symbol: the pretrained network symbol
    arg_params: the argument parameters of the pretrained model
    num_classes: the number of classes for the fine-tune datasets
    layer_name: the layer name before the last fully-connected layer

    Returns (new_symbol, new_arg_params); new_arg_params drops every
    pretrained weight whose name contains 'fc1' so the new head is
    randomly initialised by the trainer.
    """
    all_layers = symbol.get_internals()
    net = all_layers[layer_name+'_output']
    # NOTE(review): the pretrained insightface graph already has a layer named
    # 'fc1' (the script later extracts 'fc1_output' as the embedding). Reusing
    # the name for a num_classes-wide head means the saved checkpoint's 'fc1'
    # now emits class logits, not embeddings — a likely cause of the
    # "expected [1], got [20]" shape error when the checkpoint is reloaded.
    # Confirm whether a distinct head name is wanted.
    net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes, name='fc1')
    net = mx.symbol.SoftmaxOutput(data=net, name='softmax')
    # Idiom fix: a dict comprehension already builds a dict; the extra dict()
    # wrapper in the original was redundant.
    new_args = {k: v for k, v in arg_params.items() if 'fc1' not in k}
    return (net, new_args)

# ---- fine-tuning driver (runs at import time) --------------------------
num_classes = 20
batch_per_gpu = 8
num_gpus = 1
ctx = mx.gpu(0)  # device used for loading/binding the pretrained model
# '<prefix>,<epoch>' string; get_model splits it on the comma.
mod_epoch = '../models/model-r34-amf/model/model,0'
img_size = (112, 112)
layer = 'fc1'  # internal layer at which the pretrained graph is truncated

# Load the pretrained checkpoint, then graft a num_classes softmax head on it.
model, sym, arg_params, aux_params = get_model(ctx, img_size, mod_epoch, layer)
new_sym, new_args = get_fine_tune_model(sym, arg_params, num_classes)

batch_size = batch_per_gpu * num_gpus
train, val = get_iterators(batch_size)
mod_score = fit(new_sym, new_args, aux_params, train, val, batch_size, num_gpus)
mx.model.save_checkpoint('orahiimages', 0, new_sym, new_args, aux_params)

What does this mean?

Any idea how to fix this?


#2

You’ve got a mismatch between the number of parameters loaded from the checkpoint file, and parameters in your Module’s symbol. You’re working with 20 classes right? But it looks like you’re trying to bind a label with shape (batch_size, 1), so I think this should be (batch_size, num_classes). Give that a try, and let me know how you get on.