YOLO from "scratch"

I am new to the MXNet library.
While experienced with neural networks, I am new to object detection.

MXNet has some implementations of the single shot detection (SSD) network and of the similar but architecturally distinct you only look once (YOLO) network.

After reading the original and improved YOLO papers (https://pjreddie.com/publications/), I am no closer to understanding how to implement the network, how to train the different bounding boxes, how to define them, etc.

Thus I was wondering if someone could demonstrate how to implement YOLO from “scratch” (e.g. use of the gluon.nn layers is acceptable, but not the built-in bounding boxes, predictions, etc.).

Here is my attempt, please help :slight_smile:

Yolo from Scratch

import

import mxnet as mx
from mxnet import nd, gluon
from mxnet.gluon import nn
import random

Variables

Dataset

# Pick the compute device: the GPU when requested, otherwise the CPU.
use_gpu = False
if use_gpu:
    ctx = mx.gpu()
else:
    ctx = mx.cpu()

# --- attributes of a single image in the dataset ---

# number of pixels along x
image_length = 105
# number of pixels along y
image_height = 10
# number of channels, e.g. 3 for (r, g, b) and 1 for gray scale
image_channels = 1

# --- attributes of the dataset as a whole ---

# how many images the dataset contains
number_of_images = 100
# how many distinct object classes appear in the dataset
number_of_classes = 4
# upper limit on the number of objects in any one image
max_number_of_objects = 3

construct dataset

# Build a synthetic dataset. Each record is a dict with:
#   'image'   - a uniformly random NDArray of shape (channels, height, length)
#   'objects' - a list of dicts with "start", "stop" (x positions) and "class"
data_set = []
for _ in range(number_of_images):

    # generate a random image
    img = nd.random_uniform(shape=(image_channels, image_height, image_length), ctx=ctx)
    record_data = {'image': img, 'objects': []}

    # give each img a random number of objects
    # NOTE: we assume that the object will always take the entire height of the image
    # random.randint includes BOTH endpoints, so the upper bound must be the
    # maximum itself (a "+1" here would allow one object too many).
    num_objs_in_img = random.randint(1, max_number_of_objects)

    # for each object
    for _ in range(num_objs_in_img):

        # start and stop of the object (again we assume it has full height);
        # both positions are drawn from 0..image_length inclusive, and
        # sorting guarantees start <= stop (they may be equal)
        start, stop = sorted([random.randint(0, image_length), random.randint(0, image_length)])

        # give the object a random class; randrange excludes its upper bound,
        # so class ids stay within 0..number_of_classes-1
        obj_class = random.randrange(number_of_classes)

        record_data["objects"].append({
            "start": start,
            "stop": stop,
            "class": obj_class
        })

    data_set.append(record_data)

Network

# optimizer hyper-parameters (passed to the Adam trainer below)
learning_rate = 0.0001
beta_1 = 0.05


# number of "grid cells"
grid_cells = 5
# again, we assume that all our objects are of max height, thus
# our grid cells are simply boxes along the x axis
#
#   |  _Grid_1_ |  _Grid_2_ |  _Grid_3_ |  _Grid_4_ |  _Grid_5_ |
#   |-----------|-----------|-----------|-----------|-----------|
# i | x x x x x | x x x x x | x x x x x | x x x x x | x x x x x |
# m | x x x x x | x x x x x | x x x x x | x x x x x | x x x x x |
# g | x x x x x | x x x x x | x x x x x | x x x x x | x x x x x |
#   |-----------|-----------|-----------|-----------|-----------|
# h |-------------------------------------------------------img_w

# number of predictions per grid cell
bounding_boxes_per_grid_cell = 10

# each bounding box normally predicts x, width, y, height (plus a score),
# however, we assume y=0 and height=max_height always, so each bounding
# box contributes three values here.
# NOTE(review): the formula below adds the class scores once per grid cell,
# so the third per-box value is presumably a confidence score as in the
# YOLO v1 layout (B * 5 + C) rather than a class - confirm.

output_cell_dim = bounding_boxes_per_grid_cell * 3 + number_of_classes


net = nn.Sequential()
with net.name_scope():
    # assume our "data" img is the output of the base network
    # thus we are only defining the actual detection part
    # therefore we will force this conv to output the
    # "grid cells" with the correct number of filters:
    # the kernel spans the full image height and one grid cell's width, and
    # the horizontal stride equals that width so the windows do not overlap
    grid_div = image_length // grid_cells
    kernel = (image_height, grid_div)
    stride = (1, grid_div)
    net.add(nn.Conv2D(channels=output_cell_dim, kernel_size=kernel, strides=stride))


net.initialize(mx.init.Normal(0.02), ctx=ctx)
net_trainer = gluon.Trainer(net.collect_params(), 'adam', {"learning_rate": learning_rate, "beta1": beta_1})
net

Confirm Output Size

from math import floor

# Recompute by hand the spatial size the detection conv should produce,
# using the standard convolution output-size formula:
#   out = floor((in + 2*pad - dilation*(kernel - 1) - 1) / stride) + 1
# Tuples are ordered (height, width).
padding = (0, 0)
dilation = (1, 1)
kernel_size = (image_height, image_length // grid_cells)
stride = (1, image_length // grid_cells)
width = image_length
height = image_height
out_width = floor((width + 2 * padding[1] - dilation[1] * (kernel_size[1] - 1) - 1) / stride[1]) + 1
out_height = floor((height + 2 * padding[0] - dilation[0] * (kernel_size[0] - 1) - 1) / stride[0]) + 1
print(out_height," x ", out_width)


# Run the first image through the network as a batch of one and check that
# the output can be viewed as (batch, values per grid cell, grid cells).
# expand_dims adds the leading batch axis without a round trip through numpy.
test = data_set[0]["image"].expand_dims(axis=0)
# The reshape target is derived from the network configuration instead of
# hard-coded numbers, so it stays correct if the box/class/grid counts change.
net(test).reshape((1, output_cell_dim, grid_cells)).shape