TensorRT speedup of yolo3_darknet53 fails

Hello friends.

I am trying to use TensorRT to speed up the GluonCV yolov3_darknet53 model, following the tutorial "Optimizing Deep Learning Computation Graphs with TensorRT".
I replaced resnet18 with yolov3_darknet53, but the program crashed while building the subgraph. The output said: "Finding ancestor failed. There is probably a loop in the graph".

The full output is:

Building TensorRT engine
[14:03:02] src/operator/subgraph/build_subgraph.cc:691: start to execute TensorRT.
[14:03:02] src/operator/subgraph/build_subgraph.cc:300: Found a cycle when BFS from node darknetv30_darknetbasicblockv31__plus0. Excluding nodes darknetv30_darknetbasicblockv32__plus0, and retrying
Traceback (most recent call last):
  File "./yolo3_trt.py", line 43, in <module>
    trt_sym = sym.get_backend_symbol('TensorRT')
  File "/opt/mxnet/python/mxnet/symbol/symbol.py", line 2564, in get_backend_symbol
    check_call(_LIB.MXGenBackendSubgraph(self.handle, c_str(backend), ctypes.byref(out)))
  File "/opt/mxnet/python/mxnet/base.py", line 252, in check_call
    raise MXNetError(py_str(_LIB.MXGetLastError()))
mxnet.base.MXNetError: [14:03:02] src/operator/subgraph/build_subgraph.cc:209: Check failed: count < indexed_graph.num_nodes() (727 vs. 727) : Finding ancestor failed. There is probably a loop in the graph
Stack trace:
  [bt] (0) /usr/local/lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x43) [0x7fc82c58dd33]
  [bt] (1) /usr/local/lib/libmxnet.so(mxnet::op::sg::LabelSubgraph(nnvm::Graph const&, std::shared_ptr<mxnet::op::SubgraphSelectorV2>, int, unsigned long, std::vector<std::shared_ptr<mxnet::op::BiDirectedNode>, std::allocator<std::shared_ptr<mxnet::op::BiDirectedNode> > > const&, std::vector<mxnet::op::BiDirectedNode*, std::allocator<mxnet::op::BiDirectedNode*> >*, std::unordered_set<mxnet::op::BiDirectedNode const*, std::hash<mxnet::op::BiDirectedNode const*>, std::equal_to<mxnet::op::BiDirectedNode const*>, std::allocator<mxnet::op::BiDirectedNode const*> >*)+0x17a0) [0x7fc82e384010]
  [bt] (2) /usr/local/lib/libmxnet.so(mxnet::op::sg::PreSelectSubgraphNodes(nnvm::Graph const&, std::shared_ptr<mxnet::op::SubgraphSelectorV2>, int, unsigned long, std::vector<std::shared_ptr<mxnet::op::BiDirectedNode>, std::allocator<std::shared_ptr<mxnet::op::BiDirectedNode> > > const&, std::vector<mxnet::op::BiDirectedNode*, std::allocator<mxnet::op::BiDirectedNode*> >*)+0x167) [0x7fc82e385517]
  [bt] (3) /usr/local/lib/libmxnet.so(mxnet::op::sg::SelectSubgraphNodes(nnvm::Graph*, std::shared_ptr<mxnet::op::SubgraphSelectorV2>, std::vector<std::shared_ptr<mxnet::op::BiDirectedNode>, std::allocator<std::shared_ptr<mxnet::op::BiDirectedNode> > > const&, std::vector<std::vector<mxnet::op::BiDirectedNode*, std::allocator<mxnet::op::BiDirectedNode*> >, std::allocator<std::vector<mxnet::op::BiDirectedNode*, std::allocator<mxnet::op::BiDirectedNode*> > > >*, std::vector<std::shared_ptr<mxnet::op::SubgraphSelectorV2>, std::allocator<std::shared_ptr<mxnet::op::SubgraphSelectorV2> > >*, mxnet::op::BiDirectedNode const*, unsigned long, unsigned long*)+0x10f) [0x7fc82e385c4f]
  [bt] (4) /usr/local/lib/libmxnet.so(mxnet::op::sg::FindSubgraphs(nnvm::Graph*, mxnet::op::SubgraphProperty const&, std::vector<std::shared_ptr<mxnet::op::BiDirectedNode>, std::allocator<std::shared_ptr<mxnet::op::BiDirectedNode> > > const&, std::vector<std::vector<mxnet::op::BiDirectedNode*, std::allocator<mxnet::op::BiDirectedNode*> >, std::allocator<std::vector<mxnet::op::BiDirectedNode*, std::allocator<mxnet::op::BiDirectedNode*> > > >*, std::vector<std::shared_ptr<mxnet::op::SubgraphSelectorV2>, std::allocator<std::shared_ptr<mxnet::op::SubgraphSelectorV2> > >*)+0x317) [0x7fc82e386897]
  [bt] (5) /usr/local/lib/libmxnet.so(mxnet::op::BuildSubgraph(nnvm::Graph&&)+0x482) [0x7fc82e388872]
  [bt] (6) /usr/local/lib/libmxnet.so(std::_Function_handler<nnvm::Graph (nnvm::Graph), nnvm::Graph (*)(nnvm::Graph&&)>::_M_invoke(std::_Any_data const&, nnvm::Graph&&)+0x20) [0x7fc82c8d7940]
  [bt] (7) /usr/local/lib/libmxnet.so(nnvm::ApplyPasses(nnvm::Graph, std::vector<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > > > const&)+0x1171) [0x7fc831f255a1]
  [bt] (8) /usr/local/lib/libmxnet.so(nnvm::ApplyPass(nnvm::Graph, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&)+0xbe) [0x7fc82ecdd6ee]

The failure can be easily reproduced with the following script:

from gluoncv import model_zoo, data, utils
from matplotlib import pyplot as plt
from gluoncv.utils import export_block
import mxnet as mx
from mxnet.contrib import onnx as onnx_mxnet
import numpy as np
import time

# Settings for the reproduction script.
OUTPUT = "./"  # not referenced anywhere in the visible script
DATA = "./cat.png"
SIZE = 320
# MODEL is both the model-zoo name and the checkpoint prefix, so the export
# below and the later mx.model.load_checkpoint(MODEL, 0) cannot drift apart.
MODEL = "yolo3_darknet53_coco"
INPUT_SHAPE = (1, 3, SIZE, SIZE)

# Fetch the pretrained detector and hybridize it so it can be exported.
net = model_zoo.get_model(MODEL, pretrained=True)
net.hybridize()

# Load the test image resized to SIZE on its short side.
x, img = data.transforms.presets.yolo.load_test(DATA, short=SIZE)

# Run one forward pass before exporting.
# NOTE(review): Gluon requires a hybridized block to have run forward at least
# once before export() -- confirm against the installed MXNet version.
class_IDs, scores, bounding_boxs = net(x)
net.export(MODEL)

# Load the exported symbol and parameters (epoch 0) from the MODEL prefix.
sym, arg_params, aux_params = mx.model.load_checkpoint(MODEL, 0)

# Create sample input
batch_shape = INPUT_SHAPE
# NOTE(review): `input` shadows the Python builtin of the same name; it is kept
# because the warm-up and timing loops below refer to it.
input = mx.nd.zeros(batch_shape)


print('Building TensorRT engine')
# This is the call that raises the MXNetError shown in the traceback above
# ("Finding ancestor failed. There is probably a loop in the graph").
trt_sym = sym.get_backend_symbol('TensorRT')
arg_params, aux_params = mx.contrib.tensorrt.init_tensorrt_params(trt_sym, arg_params, aux_params)
mx.contrib.tensorrt.set_use_fp16(True)
# Bind on the GPU with gradients disabled (grad_req='null').
executor = trt_sym.simple_bind(ctx=mx.gpu(), data=batch_shape,
                               grad_req='null', force_rebind=True)
executor.copy_params_from(arg_params, aux_params)

# Warm-up: run a few untimed forward passes first.
# NOTE(review): presumably this keeps one-time setup cost out of the timed
# run -- confirm.
print('Warming up TensorRT')
for _ in range(10):
    y_gen = executor.forward(is_train=False, data=input)
    # Wait on the first output so each pass finishes before the next starts.
    y_gen[0].wait_to_read()

# Timing: measure wall-clock time for 300 forward passes.
# The start and end of the interval must be read from the same clock.
# perf_counter() is monotonic wall-clock time, whereas process_time() counts
# only this process's CPU time.
print('Starting TensorRT timed run')
start = time.perf_counter()
for _ in range(300):
    y_gen = executor.forward(is_train=False, data=input)
    y_gen[0].wait_to_read()
elapsed = time.perf_counter() - start
# Total elapsed seconds for the 300 timed passes.
print(elapsed)

I have tried different versions of MXNet. When optimizing yolov3_mobilev1, the code works fine. I suppose there is a bug or flaw either in the BuildSubgraph procedure or in the GluonCV model implementation.

So, if I still want to use the TensorRT integration, what should I do to fix the problem? Is there a way, such as manually modifying the GluonCV yolo3 source code or the exported symbol.json file, to make this work?