I’ve been following the guides for working with the gluon API on multiple GPUs but I’m running into memory errors when attempting to sum the correct number of predictions during my validation loop:
mxnet.base.MXNetError: [15:24:56] src/storage/./pooled_storage_manager.h:143: cudaMalloc failed: out of memory
I’m still running out of memory when reducing the batch size to 2 and the training loop seems to work fine so I must be doing something wrong but haven’t been able to figure it out.
If I try waiting for the computations before printing a single prediction, I hit the error so presumably the real problem is happening with the mx.nd.argmax(net(X), axis=1) for X in data
part of the code. I had hoped turning it into a generator would help, but it doesn’t seem to.
def valid_batch(data_it, label_it, ctx, net):
data = gluon.utils.split_and_load(data_it, ctx)
preds = (mx.nd.argmax(net(X), axis=1) for X in data)
for pred in preds:
print("datum", type(pred), pred.shape, pred.size)
pred.wait_to_read() # Error occurs here
print(pred)
The shape of each NDArray in data is (4, 3, 480, 640).
If it helps, here’s all of the code together:
class DataIterLoader():
def __init__(self, data_iter):
self.data_iter = data_iter
def __iter__(self):
self.data_iter.reset()
return self
def __next__(self):
batch = self.data_iter.__next__()
assert len(batch.data) == len(batch.label) == 1
data = batch.data[0]
label = batch.label[0]
return data, label
def next(self):
return self.__next__() # for Python 2
def forward_backward(net, data, label):
with mx.autograd.record():
losses = [loss_fn(net(X), Y) for X, Y in zip(data, label)]
for l in losses:
l.backward()
def train_batch(data_it, label_it, ctx, net, trainer):
# Split the data batch and load them on GPUs
data = gluon.utils.split_and_load(data_it, ctx)
label = gluon.utils.split_and_load(label_it, ctx)
# Compute gradient
forward_backward(net, data, label)
# Update parameters
trainer.step(data_it.shape[0])
def valid_batch(data_it, label_it, ctx, net):
data = gluon.utils.split_and_load(data_it, ctx)
#labels = gluon.utils.split_and_load(label_it, ctx)
preds = (mx.nd.argmax(net(X), axis=1) for X in data)
for pred in preds:
print("datum", type(pred), pred.shape, pred.size)
pred.wait_to_read() # out of memory error here
print(pred)
break
# Load the RGB means for the training set, then determine the batch
# size
means = json.loads(open(args["means"]).read())
bat_size = config.BATCH_SIZE * args["num_devices"]
train_iter = mx.io.ImageRecordIter(
path_imgrec=config.TRAIN_MX_REC,
data_shape=(3, 480, 640),
batch_size=bat_size,
#rand_crop=True,
rand_mirror=True,
#rotate=15,
#max_shear_ratio=0.1,
mean_r=means["R"],
mean_g=means["G"],
mean_b=means["B"],
preprocess_threads=args["num_devices"] * 2
)
val_iter = mx.io.ImageRecordIter(
path_imgrec=config.VAL_MX_REC,
data_shape=(3, 480, 640),
batch_size=bat_size,
mean_r=means["R"],
mean_g=means["G"],
mean_b=means["B"]
)
train_iter_loader = DataIterLoader(train_iter)
val_iter_loader = DataIterLoader(val_iter)
# Construct the checkpoints path
checkpoints_path = os.path.sep.join([args["checkpoints"],
args["prefix"]])
# If there is no specific model starting epoch supplied, then
# initialize the network
if args["start_epoch"] <= 0:
# Build the VGGNet architecture
print("[INFO] Building network...")
model = VGG19()
# Otherwise, a specific checkpoint was supplied
else:
# Load the checkpoint from disk
print("[INFO] Loading epoch {}...".format(args["start_epoch"]))
with warnings.catch_warnings():
warnings.simplefilter("ignore")
# Figure out checkpoint filename
pad = 4 - len(str(args["start_epoch"]))
zeroes = "0" * pad
fname = args["prefix"] + "-" + zeroes + str(args["start_epoch"])
# Load our model
model = gluon.SymbolBlock.imports(args["prefix"] + "-symbol.json", ["data"], fname)
ctx = [mx.gpu(i) for i in range(0, args["num_devices"])]
model.initialize(mx.initializer.MSRAPrelu(), ctx=ctx)
model.hybridize()
trainer = gluon.Trainer(model.collect_params(), "sgd", {"learning_rate": args["learning_rate"]})
# Define our loss function
loss_fn = gluon.loss.SoftmaxCrossEntropyLoss()
# Train the network
print("[INFO] Training network...")
for epoch in range(args["end_epoch"]):
# Training Loop
start = time()
for d, l in train_iter_loader: # start of mini-batch
train_batch(d, l, ctx, model, trainer)
mx.nd.waitall() # Wait until all computations are finished to benchmark the time
print("[Epoch {}] Training Time = {:.1f} sec".format(epoch, time() - start))
# Validation loop
correct, num = 0.0, 0.0
for d, l in val_iter_loader:
correct += valid_batch(d, l, ctx, model)
num += d.shape[0]
mx.nd.waitall()
print("\tValidation Accuracy = {:.2f}".format(correct / num * 100))
# Save a checkpoint
path = os.path.sep.join([checkpoints_path, args["prefix"]])
print("Saving checkpoint file {} to {}...".format(path, checkpoints_path))
model.export(path, epoch=epoch)
And the full stack trace:
Traceback (most recent call last):
File "train_vggnet.py", line 172, in <module>
correct += valid_batch(d, l, ctx, model)
File "train_vggnet.py", line 91, in valid_batch
print(pred)
File "/uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/ndarray/ndarray.py", line 189, in __repr__
return '\n%s\n<%s %s @%s>' % (str(self.asnumpy()),
File "/uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/ndarray/ndarray.py", line 1980, in asnumpy
ctypes.c_size_t(data.size)))
File "/uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/base.py", line 252, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [15:44:44] src/storage/./pooled_storage_manager.h:143: cudaMalloc failed: out of memory
Stack trace returned 10 entries:
[bt] (0) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x40123a) [0x2b9be9d3523a]
[bt] (1) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x401851) [0x2b9be9d35851]
[bt] (2) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x340f493) [0x2b9becd43493]
[bt] (3) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x341390e) [0x2b9becd4790e]
[bt] (4) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(void mxnet::CopyFromToDnsImpl<mshadow::gpu, mshadow::gpu>(mxnet::NDArray const&, mxnet::NDArray const&, mxnet::RunContext)+0x33a) [0x2b9bec7f85ca]
[bt] (5) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(void mxnet::CopyFromToImpl<mshadow::gpu, mshadow::gpu>(mxnet::NDArray const&, mxnet::NDArray const&, mxnet::RunContext, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&)+0x45d) [0x2b9bec81098d]
[bt] (6) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x2edcaab) [0x2b9bec810aab]
[bt] (7) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x2cb1494) [0x2b9bec5e5494]
[bt] (8) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x2cb8533) [0x2b9bec5ec533]
[bt] (9) /uufs/chpc.utah.edu/common/home/u6000791/venv/rana/lib/python3.5/site-packages/mxnet/libmxnet.so(+0x2cb8786) [0x2b9bec5ec786]