Implementing YOLO Object Detection from Scratch
YOLO (You Only Look Once) is one of the most popular object detection algorithms due to its speed and accuracy. In this tutorial, we'll implement a simplified version of YOLO from scratch using PyTorch.
Understanding YOLO
YOLO frames object detection as a single regression problem rather than a classification pipeline: instead of running a classifier over many region proposals, it divides the image into an S x S grid and, in one forward pass, predicts bounding boxes, confidence scores, and class probabilities for every grid cell.
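Concretely, with a 13x13 grid, 3 boxes per cell, and 80 classes (the defaults used throughout this tutorial), a single forward pass produces every prediction at once:

    # Predictions per image for S=13 grid, B=3 boxes per cell, C=80 classes:
    # each box carries 4 coordinates + 1 objectness score + C class scores.
    S, B, C = 13, 3, 80
    print(S * S * B * (5 + C))  # 43095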

The YOLO Architecture
The YOLO architecture consists of:
- A convolutional backbone that extracts a grid of feature maps from the input image
- A detection head that maps each grid cell's features to bounding box coordinates, an objectness (confidence) score, and class probabilities
Implementation Steps
Step 1: Define the Model Architecture
Let's start by defining a simplified YOLO model:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvBlock(nn.Module):
    """Convolution + batch norm + leaky ReLU, the basic YOLO building block."""
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
                              padding=kernel_size // 2, bias=False)
        self.bn = nn.BatchNorm2d(out_channels)
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, x):
        return self.activation(self.bn(self.conv(x)))

class SimpleYOLO(nn.Module):
    """A heavily simplified single-scale YOLO. The real YOLOv3 uses a
    Darknet-53 backbone with residual connections and three detection scales."""
    def __init__(self, num_classes=80):
        super().__init__()
        self.num_classes = num_classes
        # Backbone: five stride-2 convolutions downsample by 32,
        # so a 416x416 input produces a 13x13 grid
        self.backbone = nn.Sequential(
            ConvBlock(3, 32),
            ConvBlock(32, 64, stride=2),
            ConvBlock(64, 128, stride=2),
            ConvBlock(128, 256, stride=2),
            ConvBlock(256, 512, stride=2),
            ConvBlock(512, 1024, stride=2),
            ConvBlock(1024, 512, kernel_size=1),
            ConvBlock(512, 1024)
        )
        # Detection head: 3 anchors per cell, each predicting
        # 4 box coordinates + 1 objectness score + num_classes scores
        self.head = nn.Conv2d(1024, 3 * (5 + num_classes), kernel_size=1)

    def forward(self, x):
        features = self.backbone(x)
        output = self.head(features)
        # Reshape to (batch, anchors, grid_y, grid_x, 5 + num_classes)
        batch_size, _, grid_size, _ = output.shape
        output = output.view(batch_size, 3, 5 + self.num_classes, grid_size, grid_size)
        output = output.permute(0, 1, 3, 4, 2)
        return output
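A quick sanity check of the output shape (a sketch assuming 416x416 inputs, which the stride-32 backbone reduces to a 13x13 grid):

    model = SimpleYOLO(num_classes=80)
    dummy = torch.randn(2, 3, 416, 416)  # batch of 2 RGB images
    out = model(dummy)
    print(out.shape)  # torch.Size([2, 3, 13, 13, 85]): (batch, anchor, gy, gx, 5 + classes)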
Step 2: Implement the Loss Function
YOLO uses a complex loss function that combines:
- Bounding box coordinate loss
- Object confidence loss
- Class prediction loss
def yolo_loss(predictions, targets, lambda_coord=5.0, lambda_noobj=0.5):
    """Simplified YOLO loss. lambda_coord up-weights localization;
    lambda_noobj down-weights confidence loss for empty cells, since
    most grid cells contain no object."""
    batch_size = predictions.size(0)

    # Extract components from predictions. The x, y offsets and the
    # confidence are squashed to (0, 1) with a sigmoid, since the head
    # outputs raw logits while the targets live in [0, 1]
    pred_xy = torch.sigmoid(predictions[..., 0:2])   # x, y offsets within cell
    pred_wh = predictions[..., 2:4]                  # w, h (raw; simplified)
    pred_boxes = torch.cat([pred_xy, pred_wh], dim=-1)
    pred_conf = torch.sigmoid(predictions[..., 4])   # objectness
    pred_cls = predictions[..., 5:]                  # class logits

    # Extract components from targets
    target_boxes = targets[..., :4]
    target_conf = targets[..., 4]
    target_cls = targets[..., 5:]

    # Masks: cells responsible for an object vs. empty cells
    obj_mask = target_conf > 0
    noobj_mask = target_conf == 0

    # Coordinate loss (only for cells that contain an object)
    box_loss = F.mse_loss(pred_boxes[obj_mask], target_boxes[obj_mask], reduction='sum')

    # Confidence loss, split so empty cells can be down-weighted
    conf_obj_loss = F.mse_loss(pred_conf[obj_mask], target_conf[obj_mask], reduction='sum')
    conf_noobj_loss = F.mse_loss(pred_conf[noobj_mask], target_conf[noobj_mask], reduction='sum')

    # Class loss (BCE with logits, as in YOLOv3's multi-label formulation)
    cls_loss = F.binary_cross_entropy_with_logits(
        pred_cls[obj_mask], target_cls[obj_mask], reduction='sum')

    # Weighted sum, averaged over the batch
    total_loss = (
        lambda_coord * box_loss +
        conf_obj_loss +
        lambda_noobj * conf_noobj_loss +
        cls_loss
    ) / batch_size
    return total_loss
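A minimal smoke test with random tensors, shaped to match the model output above:

    preds = torch.randn(2, 3, 13, 13, 85)
    targs = torch.zeros(2, 3, 13, 13, 85)
    targs[0, 0, 6, 6, :5] = torch.tensor([0.5, 0.5, 0.2, 0.3, 1.0])  # one object
    targs[0, 0, 6, 6, 5] = 1.0                                       # class 0
    print(yolo_loss(preds, targs).item())  # a finite positive scalar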
Step 3: Data Preparation
To train YOLO, we need to prepare our data in the right format:
from torch.utils.data import Dataset
import cv2

class YOLODataset(Dataset):
    def __init__(self, image_paths, annotations, grid_size=13, num_classes=80, transform=None):
        self.image_paths = image_paths
        self.annotations = annotations
        self.grid_size = grid_size
        self.num_classes = num_classes
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Annotations: list of (x, y, w, h, class_id) tuples, normalized to
        # [0, 1], with (x, y) the box center. Images are assumed to be resized
        # (e.g. to 416x416) here or inside the transform.
        boxes = self.annotations[idx]

        if self.transform:
            img, boxes = self.transform(img, boxes)
        else:
            # Default: HWC uint8 array -> CHW float tensor in [0, 1]
            img = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0

        # Convert boxes to the grid-shaped target tensor
        target = self._create_target(boxes)
        return img, target

    def _create_target(self, boxes):
        target = torch.zeros(3, self.grid_size, self.grid_size, 5 + self.num_classes)
        for box in boxes:
            x, y, w, h, cls_id = box
            # Grid cell containing the box center, clamped so x == 1.0
            # doesn't index out of bounds
            grid_x = min(int(x * self.grid_size), self.grid_size - 1)
            grid_y = min(int(y * self.grid_size), self.grid_size - 1)
            # Find best anchor box
            anchor_idx = 0  # Simplified - would normally pick the anchor with highest IoU
            # Set target values
            target[anchor_idx, grid_y, grid_x, 0] = x * self.grid_size - grid_x  # x offset in cell
            target[anchor_idx, grid_y, grid_x, 1] = y * self.grid_size - grid_y  # y offset in cell
            target[anchor_idx, grid_y, grid_x, 2] = w  # width, image-relative
            target[anchor_idx, grid_y, grid_x, 3] = h  # height, image-relative
            target[anchor_idx, grid_y, grid_x, 4] = 1  # objectness
            target[anchor_idx, grid_y, grid_x, 5 + int(cls_id)] = 1  # one-hot class
        return target
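Hooking the dataset up to a standard DataLoader. The paths and annotations below are placeholders; substitute your own data:

    from torch.utils.data import DataLoader

    image_paths = ["data/img_001.jpg", "data/img_002.jpg"]  # hypothetical files
    annotations = [
        [(0.5, 0.5, 0.2, 0.3, 0)],   # per image: (x, y, w, h, class_id), normalized
        [(0.3, 0.7, 0.1, 0.2, 5)],
    ]
    dataset = YOLODataset(image_paths, annotations)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)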
Step 4: Training Loop
Now let's implement the training loop:
def train_yolo(model, dataloader, optimizer, device, epochs=100):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        for batch_idx, (images, targets) in enumerate(dataloader):
            images = images.to(device)
            targets = targets.to(device)

            # Forward pass
            predictions = model(images)
            loss = yolo_loss(predictions, targets)

            # Backward pass and parameter update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()
            if batch_idx % 10 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}, Loss: {loss.item():.6f}")
        print(f"Epoch {epoch}, Average Loss: {epoch_loss / len(dataloader):.6f}")
Step 5: Inference and Non-Maximum Suppression
After training, we need to process the model's output to get usable detections:
def process_predictions(predictions, confidence_threshold=0.5, nms_threshold=0.4):
    """Decode raw model output into boxes and apply class-agnostic NMS."""
    batch_size = predictions.shape[0]
    grid_size = predictions.shape[2]

    all_boxes = []
    for b in range(batch_size):
        boxes = []
        for i in range(grid_size):          # grid row
            for j in range(grid_size):      # grid column
                for a in range(3):          # anchor index
                    # Objectness: raw logit -> probability
                    confidence = torch.sigmoid(predictions[b, a, i, j, 4]).item()
                    if confidence > confidence_threshold:
                        # Class probabilities (sigmoid, multi-label style)
                        class_scores = torch.sigmoid(predictions[b, a, i, j, 5:])
                        class_id = torch.argmax(class_scores).item()
                        class_score = class_scores[class_id].item()
                        # Decode box: sigmoid cell offset + cell index, normalized
                        # to [0, 1]. (w, h are used directly here; full YOLO would
                        # scale anchor dimensions by exp(tw), exp(th).)
                        x = (torch.sigmoid(predictions[b, a, i, j, 0]).item() + j) / grid_size
                        y = (torch.sigmoid(predictions[b, a, i, j, 1]).item() + i) / grid_size
                        w = predictions[b, a, i, j, 2].item()
                        h = predictions[b, a, i, j, 3].item()
                        # Convert center format to corner format
                        boxes.append([x - w / 2, y - h / 2, x + w / 2, y + h / 2,
                                      confidence * class_score, class_id])

        # Apply non-maximum suppression
        boxes = torch.tensor(boxes)
        if boxes.shape[0] > 0:
            # Sort by combined score, highest first
            _, indices = torch.sort(boxes[:, 4], descending=True)
            boxes = boxes[indices]
            keep_boxes = []
            while boxes.shape[0] > 0:
                # Keep the highest-scoring remaining box...
                keep_boxes.append(boxes[0])
                if boxes.shape[0] == 1:
                    break
                # ...and drop every remaining box that overlaps it too much
                ious = calculate_iou(boxes[0, :4], boxes[1:, :4])
                boxes = boxes[1:][ious <= nms_threshold]
            all_boxes.append(torch.stack(keep_boxes))
        else:
            all_boxes.append(torch.tensor([]))
    return all_boxes

def calculate_iou(box, boxes):
    """IoU between one box and a set of boxes, all in (x1, y1, x2, y2) format."""
    # Intersection rectangle
    x1 = torch.max(box[0], boxes[:, 0])
    y1 = torch.max(box[1], boxes[:, 1])
    x2 = torch.min(box[2], boxes[:, 2])
    y2 = torch.min(box[3], boxes[:, 3])
    intersection = torch.clamp(x2 - x1, min=0) * torch.clamp(y2 - y1, min=0)
    # Union = sum of areas minus intersection
    box_area = (box[2] - box[0]) * (box[3] - box[1])
    boxes_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    union = box_area + boxes_area - intersection
    return intersection / (union + 1e-6)  # epsilon guards against division by zero
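End-to-end inference, sketched with a random tensor standing in for a preprocessed 416x416 image:

    model.eval()
    with torch.no_grad():
        img = torch.randn(1, 3, 416, 416).to(device)  # stand-in for a real image
        detections = process_predictions(model(img).cpu())
    for det in detections[0]:
        x1, y1, x2, y2, score, cls_id = det.tolist()
        print(f"class {int(cls_id)}  score {score:.2f}  box ({x1:.2f}, {y1:.2f}, {x2:.2f}, {y2:.2f})")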
Conclusion
This tutorial covered the basics of implementing YOLO from scratch. In a real-world scenario, you would:
- Match ground-truth boxes to anchor boxes by IoU instead of always using the first anchor
- Predict at multiple scales to handle objects of different sizes
- Apply data augmentation and train on a large labeled dataset such as COCO
- Initialize the backbone from weights pretrained on ImageNet
YOLO has evolved through several versions (v1 to v8), each improving upon the previous. The implementation shown here is simplified but captures the core concepts of the YOLO architecture.
For production use, consider a maintained pre-trained implementation such as Ultralytics' YOLOv5/YOLOv8 or the original Darknet, or torchvision's built-in detectors (Faster R-CNN, RetinaNet, SSD) if you are not tied to YOLO specifically.