I am new to the MXNet library.
While experienced with neural networks, I am new to object detection.
MXNet has some implementations of the single shot detection (SSD) network, and of the similar but architecturally distinct You Only Look Once (YOLO) network.
After reading the original and improved YOLO papers (https://pjreddie.com/publications/) I am no closer to understanding how to implement the network, how to train the different bounding boxes, how to define them, etc.
Thus I was wondering if someone could make a demonstration of how to implement YOLO from “scratch” (e.g. use of the gluon.nn layers is acceptable, but not the built-in bounding boxes, predictions, etc.).
Here is my attempt, please help
Yolo from Scratch
import
import mxnet as mx
from mxnet import nd, gluon
from mxnet.gluon import nn
import random
Variables
Dataset
# Device selection: arrays and parameters live on the CPU unless use_gpu is set.
use_gpu = False
ctx = mx.cpu() if not use_gpu else mx.gpu()

# Shape of a single image in our dataset.
image_channels = 1  # num channels, e.g. 1 is just gray scale, 3 is (r,g,b)
image_height = 10  # num pixels along y
image_length = 105  # num pixels along x

# Size and contents of the dataset as a whole.
number_of_classes = 4  # distinct classes of items that appear in this data set
max_number_of_objects = 3  # upper limit on objects appearing in a single image
number_of_images = 100  # images in the dataset
construct dataset
# Build a synthetic dataset: every record pairs one random image with a list
# of randomly placed, randomly labelled objects.
data_set = []
for i in range(number_of_images):
    # generate a random image, laid out as (channels, height, length)
    img = nd.random_uniform(
        shape=(image_channels, image_height, image_length), ctx=ctx
    )
    record_data = {'image': img, 'objects': []}
    # give each img a random number of objects
    # NOTE: we assume that the object will always take the entire height of the image
    # random.randint includes BOTH endpoints, so the upper bound must be the
    # maximum itself (a "+1" here would allow one object too many).
    num_objs_in_img = random.randint(1, max_number_of_objects)
    # for each object
    for j in range(num_objs_in_img):
        # start and stop of the object along x (again we assume it has full height)
        # NOTE(review): the two draws are independent, so start may equal stop
        # (a zero-width object) -- confirm whether that is acceptable.
        start_stop = sorted(
            [random.randint(0, image_length), random.randint(0, image_length)]
        )
        # give the object a random class in 0 .. number_of_classes - 1;
        # randrange excludes its upper bound, so no out-of-range label appears
        obj_class = random.randrange(number_of_classes)
        record_data["objects"].append({
            "start": start_stop[0],
            "stop": start_stop[1],
            "class": obj_class
        })
    data_set.append(record_data)
Network
# Optimiser hyper-parameters.
learning_rate = 1e-4
beta_1 = 0.05

# The image is cut into this many "grid cells". Every object is assumed to
# span the full image height, so the cells are simply consecutive slices
# along the x axis. (The sketch below draws four cells for illustration only;
# the value actually used is grid_cells.)
grid_cells = 5
"""
| __Grid_1__ | __Grid_2__ | __Grid_3__ | __Grid_4__ |
|-------------|-------------|-------------|-------------|
i | x x x x x x | x x x x x x | x x x x x x | x x x x x x |
m | x x x x x x | x x x x x x | x x x x x x | x x x x x x |
g | x x x x x x | x x x x x x | x x x x x x | x x x x x x |
|-------------|-------------|-------------|-------------|
h |---------------------------------------------------img_w
"""

# How many boxes each grid cell predicts.
bounding_boxes_per_grid_cell = 10

# A bounding box would normally predict x, width, y, height and class.
# Here y is taken to be 0 and height to be the full image height, which
# leaves three values per box.
# NOTE(review): the size below reserves number_of_classes outputs once per
# cell in addition to the three values per box -- confirm which three
# per-box values are intended (the original note lists x, width, class).
output_cell_dim = number_of_classes + 3 * bounding_boxes_per_grid_cell
# Detection head only: a single convolution whose kernel covers exactly one
# grid cell, so each output column corresponds to one grid cell.
net = nn.Sequential()
with net.name_scope():
    # assume our "data" img is the output of the base network
    # thus we are only defining the actual detection part
    # therefore we will force this conv to output one column per
    # "grid cell" with the correct number of filters

    # width of one grid cell in pixels; uses the image_length / image_height
    # dataset attributes (the previously referenced record_image_* names are
    # not defined anywhere in this script)
    # NOTE(review): assumes image_length is a multiple of grid_cells --
    # otherwise the trailing pixels are not covered by any cell.
    grid_div = image_length // grid_cells
    # the kernel spans the full image height and one grid cell in width ...
    kernel = (image_height, grid_div)
    # ... and moves one whole cell per step, so the windows never overlap
    stride = (1, grid_div)
    net.add(nn.Conv2D(channels=int(output_cell_dim), kernel_size=kernel, strides=stride))
net.initialize(mx.init.Normal(0.02), ctx=ctx)
net_trainer = gluon.Trainer(net.collect_params(), 'adam', {"learning_rate": learning_rate, "beta1": beta_1})
# notebook-style echo of the network structure
net
Confirm Output Size
# Sanity-check the detection head: compute the expected convolution output
# size by hand, then push one image through the network and inspect its shape.
padding = (0, 0)
dilation = (1, 1)
# same kernel / stride as the detection head, derived from the dataset's
# image_length / image_height (the record_image_* names are not defined)
kernel_size = (image_height, image_length // grid_cells)
stride = (1, image_length // grid_cells)
width = image_length
height = image_height
from math import floor
# convolution output-size formula, applied per axis:
#   floor((size + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
out_width = floor((width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1]) + 1
out_height = floor((height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0]) + 1
print(out_height," x ", out_width)
# wrap the first image in a list to add a leading batch axis of size 1
test = nd.array([data_set[0]["image"].asnumpy()], ctx=ctx)
# drop the height axis (out_height is 1 for a full-height kernel), leaving
# (batch, filters, grid cells); sizes come from the configuration rather than
# hard-coded numbers so the check stays valid if the settings change
net(test).reshape((1, output_cell_dim, grid_cells)).shape