Check failed: e == CUDNN_STATUS_SUCCESS (4 vs. 0) cuDNN: CUDNN_STATUS_INTERNAL_ERROR


#1

It occurs when testing on the GPU, at the bind step with for_training=False.

Update:
It is not caused by running out of memory.
I am using the ResNet-38 segmentation code from https://github.com/itijyou/ademxapp
The definition of conv is:

def _attr_scope_lr(lr_type, lr_owner):
    """Return the ``mx.AttrScope`` holding lr/wd multipliers for a parameter.

    The multipliers (given as ``weight: lr_mult, wd_mult; bias: lr_mult,
    wd_mult``) depend on ``lr_type``:

    * ``'alex'``   -- weight: defaults;   bias: 2, 0
    * ``'alex10'`` -- weight: 10, 1;      bias: 20, 0
    * ``'fixed'``  -- weight: 0, 0;       bias: 0, 0
    * ``'torch'``  -- defaults for both (an empty scope)

    Args:
        lr_type: One of ``'alex'``, ``'alex10'``, ``'fixed'``, ``'torch'``.
        lr_owner: ``'weight'`` or ``'bias'``. Not checked for ``'torch'``.

    Returns:
        An ``mx.AttrScope`` to be used as a context manager around the
        creation of the parameter variable.

    Raises:
        AssertionError: If ``lr_type`` is unknown, or ``lr_owner`` is not
            valid for a type that distinguishes weight from bias.
    """
    # 'fixed' has to be accepted here; otherwise the 'fixed' branch below
    # could never be reached.
    assert lr_type in ('alex', 'alex10', 'torch', 'fixed')
    # weight (lr_mult, wd_mult); bias;
    # 1, 1; 2, 0;
    if lr_type == 'alex':
        if lr_owner == 'weight':
            return mx.AttrScope()
        elif lr_owner == 'bias':
            return mx.AttrScope(lr_mult='2.', wd_mult='0.')
        else:
            assert False
    # 10, 1; 20, 0;
    if lr_type == 'alex10':
        if lr_owner == 'weight':
            return mx.AttrScope(lr_mult='10.', wd_mult='1.')
        elif lr_owner == 'bias':
            return mx.AttrScope(lr_mult='20.', wd_mult='0.')
        else:
            assert False
    # 0, 0; 0, 0;
    # so apply this to both
    if lr_type == 'fixed':
        assert lr_owner in ('weight', 'bias')
        return mx.AttrScope(lr_mult='0.', wd_mult='0.')
    # 1, 1; 1, 1;
    # so do nothing
    return mx.AttrScope()
def conv(data, name, filters, kernel=3, stride=1, dilate=1, pad=-1,
         groups=1, no_bias=False, workspace=-1):
    """Create a square ``mx.sym.Convolution`` with explicitly named parameters.

    The weight (and, unless ``no_bias`` is set, the bias) is created as a
    separate ``mx.sym.Variable`` inside the attribute scope returned by
    ``_attr_scope_lr`` for the ``lr_type`` configured in ``cfg``.

    Args:
        data: Input symbol.
        name: Layer name; parameters are named ``<name>_weight`` and
            ``<name>_bias``.
        filters: Value passed as ``num_filter``.
        kernel: Kernel size, used for both spatial dimensions.
        stride: Stride, used for both spatial dimensions.
        dilate: Dilation, used for both dimensions; forced to 1 when
            ``kernel`` is 1.
        pad: Padding for both dimensions. A negative value means
            "derive it": ``((kernel - 1) * dilate + 1) // 2``, which
            requires an odd ``kernel``.
        groups: Value passed as ``num_group``.
        no_bias: If true, no bias variable is created.
        workspace: Value passed as ``workspace``. A negative value means
            "take ``cfg['workspace']``, defaulting to 512".

    Returns:
        The symbol returned by ``mx.sym.Convolution``.

    Raises:
        AssertionError: If ``pad`` is negative and ``kernel`` is even.
    """
    # A 1x1 kernel has nothing to dilate.
    if kernel == 1:
        dilate = 1

    if pad < 0:
        assert kernel % 2 == 1, 'Specify pad for an even kernel size'
        pad = ((kernel - 1) * dilate + 1) // 2

    if workspace < 0:
        workspace = cfg.get('workspace', 512)

    lr_type = cfg.get('lr_type', 'torch')

    # Arguments shared by the biased and the bias-free variant.
    conv_args = dict(
        data=data,
        name=name,
        kernel=(kernel, kernel),
        stride=(stride, stride),
        dilate=(dilate, dilate),
        pad=(pad, pad),
        num_filter=filters,
        num_group=groups,
        workspace=workspace,
    )

    with _attr_scope_lr(lr_type, 'weight'):
        conv_args['weight'] = mx.sym.Variable('{}_weight'.format(name))

    if no_bias:
        conv_args['no_bias'] = True
    else:
        with _attr_scope_lr(lr_type, 'bias'):
            conv_args['bias'] = mx.sym.Variable('{}_bias'.format(name))
        conv_args['no_bias'] = False

    return mx.sym.Convolution(**conv_args)

And my code snippet is:

import sys
sys.path.insert(0, './')  # make the local `util` package importable
import numpy as np
import mxnet as mx

# Symbolic network input and an all-zero batch of one image to bind against.
data = mx.sym.Variable('data')
input_h = 1024
input_w = 2048
input_shape = (1, 3, input_h, input_w)
dataiter = mx.io.NDArrayIter({'data': np.zeros(input_shape)}, batch_size=1)

from util.symbol.symbol import conv, cfg
cfg['lr_type'] = 'alex'
cfg['workspace'] = 1650
cfg['bn_use_global_stats'] = True

# The whole "network" is a single bias-free 3x3 convolution.
net = conv(data, 'conv1a', 64, kernel=3, stride=1, no_bias=True)

# Device switch: index 0 selects the GPU, index 1 the CPU.
device_idx = 0
devices = [mx.gpu(0), mx.cpu()]
ctx = devices[device_idx]

mod = mx.mod.Module(net, data_names=('data',), label_names=(), context=ctx)
print(mod)
# Built but not used: bind() below receives dataiter.provide_data instead.
data_shapes = [mx.io.DataDesc('data', [1, 3, 1024, 2048])]
print(dataiter.provide_data)
mod.bind(data_shapes=dataiter.provide_data,
         for_training=False,
         force_rebind=True)
print('end')

If I use the CPU, or move the definition into the current file as below, everything is fine.
# Body of conv() inlined at module level. It relies on data, name, filters,
# kernel, stride, dilate, pad, groups, no_bias and workspace already being
# defined, and only builds `net` when no_bias is true.
if kernel == 1:
    # A 1x1 kernel has nothing to dilate.
    dilate = 1

if pad < 0:
    assert kernel % 2 == 1, 'Specify pad for an even kernel size'
    pad = ((kernel - 1) * dilate + 1) // 2

if workspace < 0:
    workspace = cfg.get('workspace', 512)

lr_type = cfg.get('lr_type', 'torch')

with _attr_scope_lr(lr_type, 'weight'):
    weight = mx.sym.Variable('{}_weight'.format(name))

if no_bias:
    net = mx.sym.Convolution(
        data=data,
        weight=weight,
        name=name,
        kernel=(kernel, kernel),
        stride=(stride, stride),
        dilate=(dilate, dilate),
        pad=(pad, pad),
        num_filter=filters,
        num_group=groups,
        workspace=workspace,
        no_bias=True,
    )
Sorry for the ugly code; I don't know how to format it as a code block.
I still don't know why this happens.


#2

Could you please provide a small example that reproduces the error?


#3

Very likely a GPU out-of-memory issue.


#4

Note that there might be other processes taking GPU memory. For example, a Jupyter kernel could take a lot of memory even when nothing is actively running.


#5

Sorry, it was not a code issue; it was caused by insufficient memory. After I changed the scale of the images, it worked.