Cannot run MPI training of LeNet on these two machines


#1

hosts:

192.168.137.206
192.168.137.207
export PS_VERBOSE=1
python /root/mxnet/tools/launch.py -n 2 --launcher mpi -H hosts python train_mnist.py  --network lenet --kv-store dist_sync

Sometimes MXNet executes normally, but sometimes it cannot.

The output of a normal execution:

export PS_VERBOSE=1

[root@localhost image-classification]# python /root/mxnet/tools/launch.py -n 2 --launcher mpi -H hosts python train_mnist.py  --network lenet --kv-store dist_sync
2017-11-05 22:57:30,368 INFO Start 2 workers by mpirun
2017-11-05 22:57:30,374 INFO Start 2 servers by mpirun
[22:57:30] src/van.cc:75: Bind to role=scheduler, id=1, ip=192.168.137.206, port=9103, is_recovery=0
[22:57:31] src/van.cc:75: Bind to role=server, ip=192.168.137.206, port=45510, is_recovery=0
[22:57:31] src/van.cc:75: Bind to role=server, ip=192.168.137.206, port=47152, is_recovery=0
[22:57:31] src/van.cc:75: Bind to role=worker, ip=192.168.137.206, port=50637, is_recovery=0
[22:57:31] src/van.cc:75: Bind to role=worker, ip=192.168.137.206, port=52494, is_recovery=0
[22:57:31] src/van.cc:235: assign rank=8 to node role=server, ip=192.168.137.206, port=45510, is_recovery=0
[22:57:31] src/van.cc:235: assign rank=10 to node role=server, ip=192.168.137.206, port=47152, is_recovery=0
[22:57:31] src/van.cc:235: assign rank=9 to node role=worker, ip=192.168.137.206, port=50637, is_recovery=0
[22:57:31] src/van.cc:235: assign rank=11 to node role=worker, ip=192.168.137.206, port=52494, is_recovery=0
[22:57:31] src/van.cc:251: the scheduler is connected to 2 workers and 2 servers
[22:57:31] src/van.cc:291: Barrier count for 7 : 1
[22:57:31] src/van.cc:281: S[8] is connected to others
[22:57:31] src/van.cc:281: S[10] is connected to others
[22:57:31] src/van.cc:291: Barrier count for 7 : 2
[22:57:31] src/van.cc:291: Barrier count for 7 : 3
[22:57:31] src/van.cc:281: W[9] is connected to others
[22:57:31] src/van.cc:281: W[11] is connected to others
[22:57:31] src/van.cc:291: Barrier count for 7 : 4
[22:57:31] src/van.cc:291: Barrier count for 7 : 5
[22:57:31] src/van.cc:291: Barrier count for 7 : 1
INFO:root:start with arguments Namespace(add_stn=False, batch_size=64, disp_batches=100, dtype='float32', gpus=None, kv_store='dist_sync', load_epoch=None, lr=0.05, lr_factor=0.1, lr_step_epochs='10', model_prefix=None, mom=0.9, monitor=0, network='lenet', num_classes=10, num_epochs=20, num_examples=60000, num_layers=None, optimizer='sgd', test_io=0, top_k=0, wd=0.0001)
INFO:root:start with arguments Namespace(add_stn=False, batch_size=64, disp_batches=100, dtype='float32', gpus=None, kv_store='dist_sync', load_epoch=None, lr=0.05, lr_factor=0.1, lr_step_epochs='10', model_prefix=None, mom=0.9, monitor=0, network='lenet', num_classes=10, num_epochs=20, num_examples=60000, num_layers=None, optimizer='sgd', test_io=0, top_k=0, wd=0.0001)
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
[22:57:33] src/van.cc:291: Barrier count for 4 : 1
[22:57:33] src/van.cc:291: Barrier count for 4 : 2
MKL Build:20170425
MKL Build:20170425
INFO:root:Epoch[0] Batch [100]	Speed: 721.37 samples/sec	accuracy=0.888304
INFO:root:Epoch[0] Batch [100]	Speed: 720.32 samples/sec	accuracy=0.885056
INFO:root:Epoch[0] Batch [200]	Speed: 727.53 samples/sec	accuracy=0.965625
INFO:root:Epoch[0] Batch [200]	Speed: 726.62 samples/sec	accuracy=0.966875

When it fails, it hangs and never continues executing, no matter how long you wait. The output is as follows:

export PS_VERBOSE=1

 [root@localhost image-classification]# python /root/mxnet/tools/launch.py -n 2 --launcher mpi -H hosts python train_mnist.py  --network lenet --kv-store dist_sync
2017-11-05 23:03:06,864 INFO Start 2 workers by mpirun
2017-11-05 23:03:06,869 INFO Start 2 servers by mpirun
[23:03:07] src/van.cc:75: Bind to role=scheduler, id=1, ip=192.168.137.206, port=9091, is_recovery=0
[23:03:08] src/van.cc:75: Bind to role=server, ip=192.168.137.206, port=46531, is_recovery=0
[23:03:08] src/van.cc:75: Bind to role=server, ip=192.168.137.206, port=58418, is_recovery=0
[23:03:08] src/van.cc:75: Bind to role=worker, ip=192.168.137.206, port=45193, is_recovery=0
[23:03:08] src/van.cc:75: Bind to role=worker, ip=192.168.137.206, port=59786, is_recovery=0
[23:03:08] src/van.cc:235: assign rank=9 to node role=worker, ip=192.168.137.206, port=45193, is_recovery=0
[23:03:08] src/van.cc:235: assign rank=8 to node role=server, ip=192.168.137.206, port=46531, is_recovery=0
[23:03:08] src/van.cc:235: assign rank=10 to node role=server, ip=192.168.137.206, port=58418, is_recovery=0
[23:03:08] src/van.cc:235: assign rank=11 to node role=worker, ip=192.168.137.206, port=59786, is_recovery=0
[23:03:08] src/van.cc:251: the scheduler is connected to 2 workers and 2 servers
[23:03:08] src/van.cc:291: Barrier count for 7 : 1
[23:03:08] src/van.cc:281: S[10] is connected to others
[23:03:08] src/van.cc:281: S[8] is connected to others
[23:03:08] src/van.cc:291: Barrier count for 7 : 2
[23:03:08] src/van.cc:281: W[11] is connected to others
[23:03:08] src/van.cc:281: W[9] is connected to others
[23:03:08] src/van.cc:291: Barrier count for 7 : 3
[23:03:08] src/van.cc:291: Barrier count for 7 : 4
export PS_VERBOSE=0

[root@localhost image-classification]# python /root/mxnet/tools/launch.py -n 2 --launcher mpi -H hosts python train_mnist.py  --network lenet --kv-store dist_sync
2017-11-05 23:15:32,049 INFO Start 2 workers by mpirun
2017-11-05 23:15:32,054 INFO Start 2 servers by mpirun

When using the SSH launcher, MXNet still cannot run:

[root@localhost image-classification]# python /root/mxnet/tools/launch.py -n 2 --launcher ssh -H hosts python train_mnist2.py  --network lenet --kv-store dist_sync
INFO:root:start with arguments Namespace(add_stn=False, batch_size=64, disp_batches=100, dtype='float32', gpus=None, kv_store='dist_sync', load_epoch=None, lr=0.05, lr_factor=0.1, lr_step_epochs='10', model_prefix=None, mom=0.9, monitor=0, network='lenet', num_classes=10, num_epochs=20, num_examples=60000, num_layers=None, optimizer='sgd', test_io=0, top_k=0, wd=0.0001)
INFO:root:start with arguments Namespace(add_stn=False, batch_size=64, disp_batches=100, dtype='float32', gpus=None, kv_store='dist_sync', load_epoch=None, lr=0.05, lr_factor=0.1, lr_step_epochs='10', model_prefix=None, mom=0.9, monitor=0, network='lenet', num_classes=10, num_epochs=20, num_examples=60000, num_layers=None, optimizer='sgd', test_io=0, top_k=0, wd=0.0001)
Traceback (most recent call last):
  File "train_mnist2.py", line 96, in <module>
    fit.fit(args, sym, get_mnist_iter)
  File "/root/zhangzhe/MXNET/incubator-mxnet-master/example/image-classification/common/fit.py", line 211, in fit
    monitor            = monitor)
  File "/usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/module/base_module.py", line 394, in fit
    optimizer_params=optimizer_params)
  File "/usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/module/module.py", line 421, in init_optimizer
    **optimizer_params)
  File "/usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/optimizer.py", line 52, in create_optimizer
    **kwargs)
  File "/usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/optimizer.py", line 223, in __init__
    super(SGD, self).__init__(**kwargs)
TypeError: __init__() got an unexpected keyword argument 'multi_precision'
[07:45:20] /root/mxnet/dmlc-core/include/dmlc/./logging.h:300: [07:45:20] src/kvstore/././kvstore_dist_server.h:211: Check failed: !stored.is_none() init 0 first

Stack trace returned 7 entries:
[bt] (0) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x29) [0x7f074d579639]
[bt] (1) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet7kvstore17KVStoreDistServer10DataHandleERKN2ps6KVMetaERKNS2_7KVPairsIfEEPNS2_8KVServerIfEE+0xb1a) [0x7f074e1591da]
[bt] (2) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8KVServerIfE7ProcessERKNS_7MessageE+0xf1) [0x7f074e14e491]
[bt] (3) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8Customer9ReceivingEv+0x592) [0x7f074e167bc2]
[bt] (4) /lib64/libstdc++.so.6(+0xb5230) [0x7f0747353230]
[bt] (5) /lib64/libpthread.so.0(+0x7dc5) [0x7f076508bdc5]
[bt] (6) /lib64/libc.so.6(clone+0x6d) [0x7f07646b173d]

terminate called after throwing an instance of 'dmlc::Error'
  what():  [07:45:20] src/kvstore/././kvstore_dist_server.h:211: Check failed: !stored.is_none() init 0 first

Stack trace returned 7 entries:
[bt] (0) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x29) [0x7f074d579639]
[bt] (1) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN5mxnet7kvstore17KVStoreDistServer10DataHandleERKN2ps6KVMetaERKNS2_7KVPairsIfEEPNS2_8KVServerIfEE+0xb1a) [0x7f074e1591da]
[bt] (2) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8KVServerIfE7ProcessERKNS_7MessageE+0xf1) [0x7f074e14e491]
[bt] (3) /usr/lib/python2.7/site-packages/mxnet-0.9.4-py2.7.egg/mxnet/libmxnet.so(_ZN2ps8Customer9ReceivingEv+0x592) [0x7f074e167bc2]
[bt] (4) /lib64/libstdc++.so.6(+0xb5230) [0x7f0747353230]
[bt] (5) /lib64/libpthread.so.0(+0x7dc5) [0x7f076508bdc5]
[bt] (6) /lib64/libc.so.6(clone+0x6d) [0x7f07646b173d]

bash: line 1:   925 Aborted                 python train_mnist2.py --network lenet --kv-store dist_sync
Exception in thread Thread-3:
Traceback (most recent call last):
  File "/usr/lib64/python2.7/threading.py", line 812, in __bootstrap_inner
    self.run()
  File "/usr/lib64/python2.7/threading.py", line 765, in run
    self.__target(*self.__args, **self.__kwargs)
  File "/root/mxnet/tools/../dmlc-core/tracker/dmlc_tracker/ssh.py", line 60, in run
    subprocess.check_call(prog, shell = True)
  File "/usr/lib64/python2.7/subprocess.py", line 542, in check_call
    raise CalledProcessError(retcode, cmd)
CalledProcessError: Command 'ssh -o StrictHostKeyChecking=no 192.168.137.207 -p 22 'export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/openmpi/lib:; export DMLC_SERVER_ID=1; export DMLC_PS_ROOT_URI=192.168.137.206; export DMLC_ROLE=server; export DMLC_PS_ROOT_PORT=9101; export DMLC_NUM_WORKER=2; export DMLC_NUM_SERVER=2; cd /root/zhangzhe/MXNET/incubator-mxnet-master/example/image-classification/; python train_mnist2.py --network lenet --kv-store dist_sync'' returned non-zero exit status 134

Thanks


#2

Did you make sure you terminated all the Python processes on these hosts before rerunning?


#3

I killed all Python processes.


#4

Do you figure out why? I have similar issues.

I tried to train an MLP on two machines. It just got stuck at "Barrier count for 7 : 3" when using ssh. When using mpi, it got stuck at "Bind to …".

CPU usage was low, so I guess the training was not initialized successfully.