You Only Look Once

You Only Look Once

computer-vision
deep-learning
python
yolo
ultralytics
Author

Martin Olmos

Published

January 11, 2026

You Only Look Once (YOLO) is a series of open-source, real-time object detection models that can identify and classify multiple objects within an image or video frame. Unlike traditional object detection methods that rely on region proposal networks, YOLO treats object detection as a single regression problem, directly predicting bounding boxes and class probabilities from the entire image in one evaluation.

The Ultralytics YOLOv8 series supports various tasks, including object detection, pose estimation, segmentation, and classification. It is known for its speed and accuracy, making it suitable for real-time applications such as surveillance, autonomous driving, and robotics.

Here is the code to run a YOLOv8 model using the Ultralytics library in Python for object detection and pose estimation:

Code
"""Realtime YOLOv8 object detection from a webcam.

Reads frames from the camera, runs YOLOv8 detection, draws boxes and
class labels, shows a preview window, and records the annotated stream
to an MP4 file until 'q' is pressed or the camera stops delivering.
"""
from ultralytics import YOLO
import cv2
import math
import torch


CAM_INDEX = 0  # change if you have multiple cameras
# Select GPU when available; this same device string is reused for inference.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# model
model = YOLO("yolo-Weights/yolov8n.pt")

# Optional: move model to GPU (if available)
try:
    model.to(device)
except Exception:
    pass  # older ultralytics handles device per-predict call


# start webcam
cap = cv2.VideoCapture(CAM_INDEX)
# Use named property constants (same style as the pose script) instead of
# the magic indices 3 and 4.
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# COCO class names, index-aligned with the model's integer class ids.
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]

# Video output settings — record at the size the camera actually delivers,
# which may differ from the requested 640x480.
output_path = "output_detection.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = 30
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

while True:
    success, img = cap.read()
    if not success:
        break
    # BUG FIX: the device was hard-coded to "cuda" here, which crashes on
    # CPU-only machines even though `device` was already computed above.
    results = model(img, stream=True, device=device)

    # coordinates
    for r in results:
        boxes = r.boxes

        for box in boxes:
            # bounding box
            x1, y1, x2, y2 = box.xyxy[0]
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)  # convert to int values

            # put box in cam
            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

            # confidence, rounded up to 2 decimal places
            confidence = math.ceil((box.conf[0]*100))/100
            print("Confidence --->",confidence)

            # class name
            cls = int(box.cls[0])
            print("Class name -->", classNames[cls])

            # object details — label drawn at the box's top-left corner
            org = (x1, y1)
            font = cv2.FONT_HERSHEY_SIMPLEX
            fontScale = 1
            color = (255, 0, 0)
            thickness = 2

            cv2.putText(img, classNames[cls], org, font, fontScale, color, thickness)

    # Write frame to video file
    out.write(img)

    cv2.imshow('Webcam', img)
    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
out.release()  # Release the video writer
cv2.destroyAllWindows()
print(f"Video saved to {output_path}")

Code
# Realtime pose estimation with YOLOv8 (OpenCV window)
import cv2
import time
import torch

from ultralytics import YOLO

CAM_INDEX = 0  # change if you have multiple cameras
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = YOLO('yolov8n-pose.pt')  # or 'yolov8s-pose.pt'
# Moving the model is best-effort: older ultralytics versions accept the
# device only on the predict call, so a failure here is harmless.
try:
    model.to(device)
except Exception:
    pass

cap = cv2.VideoCapture(CAM_INDEX)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Recorder for the annotated stream, sized to what the camera actually gives us.
output_path = "output_pose.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = 30
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
              int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

last_tick = time.time()
while True:
    grabbed, frame = cap.read()
    if not grabbed:
        break

    preds = model.predict(frame,
                          device=device,
                          imgsz=640,
                          conf=0.5,
                          half=(device == 'cuda'),  # fp16 only makes sense on GPU
                          verbose=False)
    annotated = preds[0].plot()

    # Instantaneous FPS from the inter-frame interval.
    tick = time.time()
    cv2.putText(annotated, f'FPS: {1 / (tick - last_tick):.1f} ({device})',
                (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    last_tick = tick

    # Persist the annotated frame before displaying it.
    out.write(annotated)

    cv2.imshow('YOLOv8 Pose', annotated)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
out.release()  # flush and close the MP4
cv2.destroyAllWindows()
print(f"Video saved to {output_path}")