Label and data in the same csv


#1

I have two questions:

  1. Does anyone know how to coax mxnet to handle csv files where the data and the label are in the same file?
  2. If not, will someone help me understand the rationale for having separate files?

My problem is, I’m dealing with close to 500 million rows and the overhead to generate carefully matched data and label files is prohibitive.

I’ve tried a number of things:

  1. slice the label from the data tensor
  2. read in the csv twice (same path for data and label_csv args) and slice out the label

When I do 1, I see:
File "<stdin>", line 1, in <module>
File “/usr/local/lib/python2.7/dist-packages/mxnet/module/base_module.py”, line 498, in fit
for_training=True, force_rebind=force_rebind)
File “/usr/local/lib/python2.7/dist-packages/mxnet/module/module.py”, line 429, in bind
state_names=self._state_names)
File “/usr/local/lib/python2.7/dist-packages/mxnet/module/executor_group.py”, line 279, in init
self.bind_exec(data_shapes, label_shapes, shared_group)
File “/usr/local/lib/python2.7/dist-packages/mxnet/module/executor_group.py”, line 382, in bind_exec
self._collect_arrays()
File “/usr/local/lib/python2.7/dist-packages/mxnet/module/executor_group.py”, line 319, in _collect_arrays
for name, _ in self.label_shapes]
KeyError: ‘softmax_label’

When I do 2, I see:
Error in operator labels1: [21:36:06] src/operator/tensor/./matrix_op-inl.h:1115: Check failed: *axis < static_cast<int>(ishape.ndim()) && *axis >= 0 Transformed axis must be smaller than the source ndim and larger than zero! Recieved axis=1, src_ndim=0, transformed axis=1

Code below:

import mxnet as mx

# Cardinalities of the categorical columns (embedding input dims).
n_uuid = 147291
n_waves = 16
n_sakey = 1632
n_svkey = 11423
n_types = 2
batch_size = 256
dropoutP = 0.5

# Workaround for data+label living in one CSV: read the same file twice,
# once as "data" and once as "label", then slice the label column out of
# the label copy below. (NOTE(review): assumes train.csv/test.csv each
# have 6 numeric columns — confirm against the files.)
train_iter = mx.io.CSVIter(data_csv='mxnet/input/train.csv', data_shape=(6,),
                           label_csv='mxnet/input/train.csv', label_shape=(6,),
                           batch_size=batch_size)
test_iter = mx.io.CSVIter(data_csv='mxnet/input/test.csv', data_shape=(6,),
                          label_csv='mxnet/input/test.csv', label_shape=(6,),
                          batch_size=batch_size)

modelName = "mxnet/testing"

data = mx.symbol.Variable('data')
label = mx.symbol.Variable('label')

# Column 2 -> uuid id, column 5 -> svkey id; both fed through embeddings.
uuid = mx.symbol.Embedding(data=data.slice_axis(axis=1, begin=2, end=3),
                           input_dim=n_uuid, output_dim=128,
                           name='uuid_embedding')
svkey = mx.symbol.Embedding(data=data.slice_axis(axis=1, begin=5, end=6),
                            input_dim=n_svkey, output_dim=128,
                            name='svkey_embedding')

# Column 0 carries a per-row instance weight (currently unused below).
instance_weight = data.slice_axis(axis=1, begin=0, end=1)

# Slice the label column (column 1) out of the 6-wide "label" copy and
# flatten it to a 1-D vector of shape (batch_size,) for SoftmaxOutput.
labels1 = mx.symbol.slice_axis(data=label, axis=1, begin=1, end=2, name="labels1")
# BUG FIX: shape=(-1) is just the integer -1; the tuple (-1,) is required.
labels2 = mx.symbol.reshape(data=labels1, shape=(-1,), name="labels2")

nn2 = mx.symbol.concat(uuid, svkey)
nn2 = mx.symbol.flatten(nn2)
nn2 = mx.symbol.Dropout(p=dropoutP, mode='training', data=nn2)  # Try with always-on input-dropout here
nn2 = mx.symbol.FullyConnected(data=nn2, num_hidden=64)
nn2 = mx.symbol.BatchNorm(data=nn2)  # First batch norm layer here, before the activation!
nn2 = mx.symbol.LeakyReLU(data=nn2, act_type='elu', slope=0.10)
nn2 = mx.symbol.FullyConnected(data=nn2, num_hidden=64)
out = mx.symbol.SoftmaxOutput(nn2, name='softmax', label=labels2)

# label_names must match the symbol's label input ('label'); leaving it at
# the default ('softmax_label',) is what raises KeyError: 'softmax_label'.
# NOTE(review): assumes CSVIter provides its label under the name 'label'
# — confirm via train_iter.provide_label.
model2 = mx.module.Module(out, context=[mx.cpu()], label_names=['label'])
model2.fit(train_iter,
           num_epoch=2,
           optimizer='adam',
           optimizer_params=(('learning_rate', 0.001),),
           eval_metric=['ce', 'acc'],
           eval_data=test_iter,
           batch_end_callback=mx.callback.Speedometer(batch_size, 5000),
           epoch_end_callback=mx.callback.do_checkpoint(modelName, 1))


#2

You can easily write your own DataIter child class to split label from data. In the following example, I’m assuming you have 3 columns, two data features (data0 and data1) and one label:

class MyCsvIter(mx.io.DataIter):
    """Iterator that splits a single CSV into data fields and a label.

    Wraps ``mx.io.CSVIter`` over a 3-column file and, for each batch,
    splits the columns into two data arrays ('data0', 'data1') and one
    label array ('label'), so data and label can live in the same file.

    Parameters
    ----------
    fname : str
        Path to the CSV file (3 columns: data0, data1, label).
    batch_size : int
        Rows per batch.
    n_col : int
        Number of columns in the CSV; expected to be 3 to match the
        hard-coded split in :meth:`next`.
    """

    def __init__(self, fname, batch_size, n_col):
        super(MyCsvIter, self).__init__()
        self.batch_size = batch_size
        self.n_col = n_col
        self.csv_iter = mx.io.CSVIter(data_csv=fname,
                                      data_shape=(n_col,),
                                      batch_size=batch_size,
                                      round_batch=True)
        # Declared shapes are 1-D (batch_size,), so next() must squeeze
        # the split axis to match.
        data0_desc = mx.io.DataDesc('data0', (batch_size,))
        data1_desc = mx.io.DataDesc('data1', (batch_size,))
        label_desc = mx.io.DataDesc('label', (batch_size,))
        self._provide_data = [data0_desc, data1_desc]
        self._provide_label = [label_desc]

    def __iter__(self):
        return self

    def reset(self):
        # Rewind the underlying CSV iterator for the next epoch.
        self.csv_iter.reset()

    def __next__(self):
        return self.next()

    @property
    def provide_data(self):
        return self._provide_data

    @property
    def provide_label(self):
        return self._provide_label

    def next(self):
        batch = self.csv_iter.next()
        # BUG FIX: mx.nd.split defaults to squeeze_axis=0, which yields
        # three (batch_size, 1) arrays and disagrees with the
        # (batch_size,) DataDescs declared above; squeeze_axis=1 drops
        # the split axis so each output is 1-D.
        data0, data1, label = mx.nd.split(batch.data[0], num_outputs=3,
                                          axis=1, squeeze_axis=1)
        return mx.io.DataBatch([data0, data1], [label])