Hi, out of curiosity I modified your code to run on a GPU and tested it on an NVIDIA P100. Here’s the modified code (I used mx.gpu() for MXNet and *.cuda() wherever I thought it was necessary for PyTorch; I develop on MXNet 99.9% of the time, so I have no PyTorch expertise):
import torch
from torch import nn as ptnn
from torch.autograd import Variable
import mxnet as mx
from mxnet.gluon import nn as mxnn
from mxnet import nd, initializer
from enum import IntEnum
from time import time
# True when PyTorch can see a CUDA device; the rest of the script uses this
# flag to decide between its GPU and CPU code paths.
use_cuda = torch.cuda.is_available()
# Row template for the results table: framework, paradigm, precision, time.
fmt = ' {:<14} {:<15} {:<12} {:>5}'
# MXNet context on which parameters and inputs are allocated. Fall back to the
# CPU when no CUDA device was detected, instead of unconditionally requesting
# a GPU that may not exist.
mx_ctx = mx.gpu() if use_cuda else mx.cpu()
class Framework(IntEnum):
    """Tag identifying which framework a benchmark step is running under."""

    PYTORCH = 1
    MXNET = 2
def get_mxnet_network():
    """Build the Gluon benchmark network and initialize it on ``mx_ctx``.

    The network is a ``HybridSequential`` of three ``Dense`` layers
    (256 units + ReLU, 128 units + ReLU, 2 units with no activation).
    All parameters use the ``Zero`` initializer.

    Returns:
        The initialized ``HybridSequential`` block.
    """
    # (units, activation) for each Dense layer, in forward order.
    layer_specs = ((256, "relu"), (128, "relu"), (2, None))

    model = mxnn.HybridSequential()
    with model.name_scope():
        for units, activation in layer_specs:
            model.add(mxnn.Dense(units, activation=activation))
    model.initialize(init=initializer.Zero(), ctx=mx_ctx)
    return model
def pytorch_weights_init(m):
    """Fill the weight and bias of a ``Linear`` module with zeros, in place.

    Intended for use with ``Module.apply``; modules that are not
    ``ptnn.Linear`` are left untouched.

    Args:
        m: The module being visited.
    """
    if not isinstance(m, ptnn.Linear):
        return
    # A uniform distribution over [0, 0] yields all zeros.
    for param in (m.weight, m.bias):
        ptnn.init.uniform_(param.data, 0, 0)
def get_pytorch_network():
    """Build the PyTorch benchmark network.

    The network is a ``Sequential`` of ``Linear(1, 256)`` -> ReLU ->
    ``Linear(256, 128)`` -> ReLU -> ``Linear(128, 2)``.
    ``pytorch_weights_init`` is applied to every sub-module after
    construction.

    Returns:
        The network, moved to the GPU when ``use_cuda`` is true and left on
        the CPU otherwise.
    """
    net = ptnn.Sequential()
    net.add_module('dense1', ptnn.Linear(1, 256))
    net.add_module('relu1', ptnn.ReLU())
    net.add_module('dense2', ptnn.Linear(256, 128))
    net.add_module('relu2', ptnn.ReLU())
    net.add_module('dense3', ptnn.Linear(128, 2))
    net.apply(pytorch_weights_init)
    # Only move to the GPU when CUDA is actually available; an unconditional
    # .cuda() raises on CPU-only machines even though the rest of the script
    # guards its CUDA-specific paths with use_cuda.
    return net.cuda() if use_cuda else net
# Wait for computation to finish to make profiling more accurate
def block(framework):
    """Block until work queued by ``framework`` has completed.

    Args:
        framework: A ``Framework`` member. For MXNET this calls
            ``mx.nd.waitall()``; for PYTORCH it calls
            ``torch.cuda.synchronize()`` when ``use_cuda`` is true. Any other
            value is a no-op.
    """
    if framework == Framework.MXNET:
        mx.nd.waitall()
        return
    if framework == Framework.PYTORCH and use_cuda:
        torch.cuda.synchronize()
def bench(net, x, framework, n_iter=1000, warmup=10):
    """Time repeated forward passes of ``net`` on ``x``.

    Args:
        net: Callable network; invoked as ``net(x)``.
        x: Input passed to every forward call.
        framework: ``Framework`` member, forwarded to ``block`` so that
            pending work is flushed before the clock starts and stops.
        n_iter: Number of timed forward passes (default 1000, the value the
            benchmark originally hard-coded).
        warmup: Number of untimed forward passes run first.

    Returns:
        Elapsed time in seconds for the ``n_iter`` timed passes.
    """
    # Imported locally so this function does not rely on a new file-level
    # import. perf_counter is monotonic and high resolution, unlike time().
    from time import perf_counter

    # The first calls may include one-off costs (for example lazy parameter
    # shape inference, graph construction after hybridize(), or device
    # start-up) that would otherwise be counted as steady-state time.
    for _ in range(warmup):
        net(x)

    block(framework)
    start = perf_counter()
    for _ in range(n_iter):
        net(x)
    block(framework)
    return perf_counter() - start
def report(framework, paradigm, precision, value=None):
    """Print one row of the results table using the module-level ``fmt``.

    Args:
        framework: Framework label for the first column.
        paradigm: Paradigm label for the second column.
        precision: Precision in bits; rendered as ``'<n> bit'``.
        value: Elapsed time in seconds, printed as whole milliseconds.
            ``None`` means "not measured" and prints ``'---'``.
    """
    # Compare against None explicitly: a measured 0.0 is a valid timing and
    # must not be rendered as "not measured".
    t = '---' if value is None else '%i' % (value * 1000)
    print(fmt.format(framework, paradigm, '%i bit' % precision, t))
# Input matrices: a (512, 1) batch of ones for each framework, in float32 and
# float16 variants.
mx_x_32 = nd.ones((512, 1), mx_ctx)
mx_x_16 = mx_x_32.astype('float16')
pt_x_32 = Variable(torch.ones((512, 1)))
# Only move the PyTorch input to the GPU when CUDA is available; an
# unconditional .cuda() raises on CPU-only machines.
if use_cuda:
    pt_x_32 = pt_x_32.cuda()
pt_x_16 = pt_x_32.half()
# ---- Table header ----------------------------------------------------------
device_label = 'GPU' if use_cuda else 'CPU'
print()
print(' Device:', device_label)
print('----------------------------------------------------')
print(fmt.format('Framework', 'Paradigm', 'Precision', 'Time'))
print('====================================================')

# ---- MXNet, imperative (not hybridized) ------------------------------------
mx_net = get_mxnet_network()
mx_imperative_32 = bench(mx_net, mx_x_32, Framework.MXNET)
report('MXNet', 'imperative', 32, mx_imperative_32)

mx_net.cast('float16')
mx_imperative_16 = bench(mx_net, mx_x_16, Framework.MXNET)
report('MXNet', 'imperative', 16, mx_imperative_16)

# ---- MXNet, symbolic (hybridized) ------------------------------------------
# Cast back to float32 before hybridizing so the 32-bit run comes first again.
mx_net.cast('float32')
mx_net.hybridize()
mx_symbolic_32 = bench(mx_net, mx_x_32, Framework.MXNET)
report('MXNet', 'symbolic', 32, mx_symbolic_32)

mx_net.cast('float16')
mx_symbolic_16 = bench(mx_net, mx_x_16, Framework.MXNET)
report('MXNet', 'symbolic', 16, mx_symbolic_16)

# ---- PyTorch ---------------------------------------------------------------
pt_net = get_pytorch_network()
pt_32 = bench(pt_net, pt_x_32, Framework.PYTORCH)
report('PyTorch', 'imperative', 32, pt_32)

# PyTorch half precision isn't supported on a CPU
if use_cuda:
    pt_16 = bench(pt_net.half(), pt_x_16, Framework.PYTORCH)
else:
    pt_16 = None
report('PyTorch', 'imperative', 16, pt_16)
print('----------------------------------------------------')
And here’s the output:
dia021@b027:~/Projects/benchmark> python mxnet_vs_pytorch_benchmark.py
Device: GPU
----------------------------------------------------
Framework Paradigm Precision Time
====================================================
MXNet imperative 32 bit 654
MXNet imperative 16 bit 486
MXNet symbolic 32 bit 199
MXNet symbolic 16 bit 256
PyTorch imperative 32 bit 143
PyTorch imperative 16 bit 135
----------------------------------------------------
I can’t really tell whether the benchmark is 100% reliable, or how the results will change with a more complicated network; I’m just reporting the output of the run. If there is time, I may post something more complicated in the future.