You Only Look Once

You Only Look Once

computer-vision
deep-learning
python
yolo
ultralytics
Autor/a

Martin Olmos

Fecha de publicación

11 de enero de 2026

You Only Look Once (YOLO) es una serie de modelos de detección de objetos en tiempo real y de código abierto que pueden identificar y clasificar múltiples objetos dentro de una imagen o fotograma de video. A diferencia de los métodos tradicionales de detección de objetos que se basan en redes de propuestas de regiones, YOLO trata la detección de objetos como un único problema de regresión, prediciendo directamente los cuadros delimitadores y las probabilidades de clase a partir de toda la imagen en una sola evaluación.

La serie Ultralytics YOLOv8 admite varias tareas, incluidas la detección de objetos, la estimación de poses, la segmentación y la clasificación. Es conocida por su velocidad y precisión, lo que la hace adecuada para aplicaciones en tiempo real como la vigilancia, la conducción autónoma y la robótica.

Aquí está el código para ejecutar un modelo YOLOv8 utilizando la biblioteca Ultralytics en Python para la detección de objetos y la estimación de poses:

Código
# Realtime object detection with YOLOv8: draw boxes and class labels on each
# webcam frame, print per-detection info, and save the annotated stream to MP4.
from ultralytics import YOLO
import cv2
import math
import torch


CAM_INDEX = 0  # change if you have multiple cameras
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# model
model = YOLO("yolo-Weights/yolov8n.pt")

# Optional: move model to GPU (if available)
try:
    model.to(device)
except Exception:
    pass  # older ultralytics handles device per-predict call


# start webcam
cap = cv2.VideoCapture(CAM_INDEX)
# Use the named property constants instead of the magic indices 3/4
# (consistent with the pose-estimation script below).
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# COCO class names; the list index matches the model's integer class id.
classNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
              "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
              "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
              "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
              "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
              "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
              "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
              "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
              "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
              "teddy bear", "hair drier", "toothbrush"
              ]

# Video output settings
output_path = "output_detection.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = 30
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
out = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

while True:
    success, img = cap.read()
    if not success:
        break  # camera unplugged or stream ended
    # BUG FIX: device was hard-coded to "cuda", which crashes on machines
    # without an NVIDIA GPU; use the device detected above instead.
    results = model(img, stream=True, device=device)

    # coordinates
    for r in results:
        boxes = r.boxes

        for box in boxes:
            # bounding box corners (top-left and bottom-right)
            x1, y1, x2, y2 = box.xyxy[0]
            x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)  # convert to int values

            # put box in cam
            cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

            # confidence, rounded up to 2 decimal places
            confidence = math.ceil((box.conf[0] * 100)) / 100
            print("Confidence --->", confidence)

            # class name
            cls = int(box.cls[0])
            print("Class name -->", classNames[cls])

            # object details: label drawn at the box's top-left corner
            org = (x1, y1)  # cv2.putText expects a point tuple
            font = cv2.FONT_HERSHEY_SIMPLEX
            fontScale = 1
            color = (255, 0, 0)
            thickness = 2

            cv2.putText(img, classNames[cls], org, font, fontScale, color, thickness)

    # Write frame to video file
    out.write(img)

    cv2.imshow('Webcam', img)
    if cv2.waitKey(1) == ord('q'):
        break

cap.release()
out.release()  # Release the video writer
cv2.destroyAllWindows()
print(f"Video saved to {output_path}")

Código
# Realtime pose estimation with YOLOv8 (OpenCV window)
import cv2, torch, time
from ultralytics import YOLO

CAM_INDEX = 0  # change if you have multiple cameras
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # prefer GPU when present

# Load the nano pose model (swap in the small variant for higher accuracy).
model = YOLO('yolov8n-pose.pt')  # or 'yolov8s-pose.pt'
# Optional: move model to GPU (if available)
try:
    model.to(device)
except Exception:
    pass  # older ultralytics handles device per-predict call

# Open the camera and request a 720p capture size.
capture = cv2.VideoCapture(CAM_INDEX)
capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1280)
capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 720)

# Video output settings: encode the annotated stream into an MP4 file.
output_path = "output_pose.mp4"
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = 30
frame_width = int(capture.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(capture.get(cv2.CAP_PROP_FRAME_HEIGHT))
writer = cv2.VideoWriter(output_path, fourcc, fps, (frame_width, frame_height))

last_tick = time.time()
while True:
    grabbed, frame = capture.read()
    if not grabbed:
        break  # camera unplugged or stream ended

    # Run pose inference on this frame; half precision is only useful on CUDA.
    results = model.predict(
        frame,
        device=device,
        imgsz=640,
        conf=0.5,
        half=(device == 'cuda'),
        verbose=False
    )
    rendered = results[0].plot()  # keypoints/skeletons drawn on a copy of the frame

    # Overlay the measured frame rate and the active device name.
    now = time.time()
    measured_fps = 1 / (now - last_tick)
    last_tick = now
    cv2.putText(rendered, f'FPS: {measured_fps:.1f} ({device})', (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

    # Persist the annotated frame, then show it in a live window.
    writer.write(rendered)
    cv2.imshow('YOLOv8 Pose', rendered)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

capture.release()
writer.release()  # Release the video writer
cv2.destroyAllWindows()
print(f"Video saved to {output_path}")