Performance of Symbol vs. NDArray vs. PyTorch

feevos · July 30, 2018, 4:37am

Hi, out of curiosity I modified your code to run on a GPU and tested it on an NVIDIA P100. Here’s the modified code (I used mx.gpu() for MXNet and *.cuda() wherever I thought it was necessary for PyTorch; I develop on MXNet 99.9% of the time, so no PyTorch expertise here):

import torch
from torch import nn as ptnn
from torch.autograd import Variable
import mxnet as mx
from mxnet.gluon import nn as mxnn
from mxnet import nd, initializer
from enum import IntEnum
from time import time

# Row template shared by the table header and every result row:
# three left-aligned text columns followed by a right-aligned time column.
fmt = ' {:<14} {:<15} {:<12} {:>5}'
# Whether PyTorch reports an available CUDA device.
use_cuda = torch.cuda.is_available()
# MXNet device context used for the MXNet parameters and inputs.
mx_ctx = mx.gpu()

class Framework(IntEnum):
    """Identifiers for the deep-learning frameworks being benchmarked."""

    # Integer tags; compared against to pick the framework-specific code path.
    PYTORCH = 1
    MXNET = 2


def get_mxnet_network():
    """Build the MXNet benchmark model.

    The model is a ``HybridSequential`` of three ``Dense`` layers
    (256 -> 128 -> 2 units); the first two use a ReLU activation and the
    last has none. Parameters are initialised with ``initializer.Zero()``
    on ``mx_ctx``.

    Returns:
        The initialised ``HybridSequential`` network.
    """
    # (units, activation) for each Dense layer, in stacking order.
    layer_specs = ((256, "relu"), (128, "relu"), (2, None))

    model = mxnn.HybridSequential()
    with model.name_scope():
        for units, activation in layer_specs:
            model.add(mxnn.Dense(units, activation=activation))
    model.initialize(init=initializer.Zero(), ctx=mx_ctx)
    return model


def pytorch_weights_init(m):
    """Zero the parameters of ``m`` when it is a ``Linear`` layer.

    Written to be passed to ``Module.apply`` (as ``get_pytorch_network``
    does), so it is called once per sub-module and silently ignores
    anything that is not an ``ptnn.Linear``.

    Args:
        m: Any module; only ``ptnn.Linear`` instances are modified.

    Returns:
        None. The layer's weight (and bias, if present) are zeroed in place.
    """
    if not isinstance(m, ptnn.Linear):
        return
    # A deterministic constant fill states the intent (all zeros) directly,
    # instead of drawing from a degenerate uniform(0, 0) distribution.
    ptnn.init.constant_(m.weight.data, 0)
    # Linear(..., bias=False) stores ``bias`` as None; skip it rather than
    # failing on ``None.data``.
    if m.bias is not None:
        ptnn.init.constant_(m.bias.data, 0)


def get_pytorch_network():
    """Build the PyTorch benchmark model.

    The model is a ``Sequential`` of three ``Linear`` layers
    (1 -> 256 -> 128 -> 2) with a ReLU after each of the first two.
    ``pytorch_weights_init`` is applied to every sub-module, then the
    network is moved to the GPU with ``.cuda()``.

    Returns:
        The initialised network, as returned by ``.cuda()``.
    """
    # (module name, module) pairs in the order they are registered.
    named_layers = (
        ('dense1', ptnn.Linear(1, 256)),
        ('relu1', ptnn.ReLU()),
        ('dense2', ptnn.Linear(256, 128)),
        ('relu2', ptnn.ReLU()),
        ('dense3', ptnn.Linear(128, 2)),
    )

    model = ptnn.Sequential()
    for label, layer in named_layers:
        model.add_module(label, layer)
    model.apply(pytorch_weights_init)
    return model.cuda()


# Wait for computation to finish to make profiling more accurate
def block(framework):
    """Wait for the given framework's pending computation to finish.

    Args:
        framework: A ``Framework`` member. For ``MXNET`` this calls
            ``mx.nd.waitall()``; for ``PYTORCH`` it calls
            ``torch.cuda.synchronize()``, but only when ``use_cuda`` is
            true. Any other value is a no-op.
    """
    if framework == Framework.MXNET:
        mx.nd.waitall()
        return
    if framework == Framework.PYTORCH and use_cuda:
        torch.cuda.synchronize()


def bench(net, x, framework, n_iter=1000, warmup=10):
    """Time repeated forward passes of ``net`` on ``x``.

    Args:
        net: Callable network; invoked as ``net(x)``.
        x: Input passed unchanged to every call.
        framework: ``Framework`` member, forwarded to ``block`` so the
            matching synchronisation call is used.
        n_iter: Number of timed forward passes. Defaults to 1000, the
            count that was previously hard-coded.
        warmup: Number of untimed forward passes run before the clock
            starts.

    Returns:
        Elapsed seconds (float) for the ``n_iter`` timed passes, measured
        between two ``block(framework)`` synchronisation points.
    """
    # perf_counter is the standard-library clock meant for measuring short
    # intervals; imported here so the block does not depend on a new
    # module-level import.
    from time import perf_counter

    # Untimed warm-up so that one-off first-call work is kept out of the
    # measurement. NOTE(review): presumably this covers lazy parameter
    # shape inference / graph construction after hybridize() and device
    # start-up costs -- confirm against the framework docs.
    for _ in range(warmup):
        net(x)

    # Drain everything queued so far (including the warm-up) before timing.
    block(framework)
    start = perf_counter()
    for _ in range(n_iter):
        net(x)
    # Calls may return before the device has finished; wait for completion
    # so the elapsed time covers the actual computation.
    block(framework)
    return perf_counter() - start


def report(framework, paradigm, precision, value=None):
    """Print one row of the results table using the shared ``fmt`` template.

    Args:
        framework: Label for the first column (e.g. ``'MXNet'``).
        paradigm: Label for the second column (e.g. ``'imperative'``).
        precision: Integer bit width; rendered as ``'<precision> bit'``.
        value: Elapsed time in seconds, or ``None`` when the configuration
            was not measured. Printed in whole milliseconds (truncated),
            or as ``'---'`` when ``None``.
    """
    # Test against None explicitly: a measured 0.0 is falsy and would
    # otherwise be reported as "not measured".
    t = '---' if value is None else '%i' % (value * 1000)
    print(fmt.format(framework, paradigm, '%i bit' % precision, t))


# Input matrices: one (512, 1) array of ones per framework, plus a
# half-precision copy of each. The MXNet arrays are created on mx_ctx and
# the PyTorch tensors are moved to the GPU with .cuda().
mx_x_32 = nd.ones((512, 1), mx_ctx)
mx_x_16 = mx_x_32.astype('float16')
pt_x_32 = Variable(torch.ones((512, 1))).cuda()
pt_x_16 = pt_x_32.half()


# Table header. The device label is derived from use_cuda only.
print()
print(' Device:', 'GPU' if use_cuda else 'CPU')
print('----------------------------------------------------')
print(fmt.format('Framework', 'Paradigm', 'Precision', 'Time'))
print('====================================================')
# MXNet, before hybridize(): reported as 'imperative'.
mx_net = get_mxnet_network()
report('MXNet', 'imperative', 32, bench(mx_net, mx_x_32, Framework.MXNET))
# Cast the same network to float16 so it matches the float16 input.
mx_net.cast('float16')
report('MXNet', 'imperative', 16, bench(mx_net, mx_x_16, Framework.MXNET))
# Back to float32, then hybridize; runs from here on are reported as 'symbolic'.
mx_net.cast('float32')
mx_net.hybridize()
report('MXNet', 'symbolic', 32, bench(mx_net, mx_x_32, Framework.MXNET))
mx_net.cast('float16')
report('MXNet', 'symbolic', 16, bench(mx_net, mx_x_16, Framework.MXNET))
# PyTorch, float32.
pt_net = get_pytorch_network()
report('PyTorch', 'imperative', 32, bench(pt_net, pt_x_32, Framework.PYTORCH))

# PyTorch half precision isn't supported on a CPU, so the 16-bit run is
# skipped (None is reported as '---') unless CUDA is available.
pt_16 = bench(pt_net.half(), pt_x_16, Framework.PYTORCH) if use_cuda else None
report('PyTorch', 'imperative', 16, pt_16)

# Table footer.
print('----------------------------------------------------')

and here’s the output:

dia021@b027:~/Projects/benchmark> python mxnet_vs_pytorch_benchmark.py 

 Device: GPU
----------------------------------------------------
 Framework      Paradigm        Precision     Time
====================================================
 MXNet          imperative      32 bit         654
 MXNet          imperative      16 bit         486
 MXNet          symbolic        32 bit         199
 MXNet          symbolic        16 bit         256
 PyTorch        imperative      32 bit         143
 PyTorch        imperative      16 bit         135
----------------------------------------------------

I cannot really tell whether the benchmark is 100% reliable, or how the results will change with a more complicated network; I am just reporting the run output. If there is time, I may post something more complicated in the future.

Topic		Replies	Views
Mxnet 1.3.1: speed/performance differences between the mxnet gluon and module/symbol APIs of at least a factor of 2 Performance	11	1380	February 27, 2019
Hybrid training speed is 20% slower than pytorch Performance	5	1328	January 11, 2019
MXNet vs Pytorch Benchmark Performance	3	2254	May 27, 2019
Homework Q4	3	374	January 29, 2019
Distributed Gluon HybridBlock is much much slower than Symbol	2	866	December 20, 2017

Performance of Symbol vs. NDArray vs. PyTorch

Related Topics