Using Whisper to Transcribe Messi

Transcribir a Messi con Whisper

NLP
IA
ML
Python
Messi
Author

Martin Olmos

Published

February 3, 2023

Whisper is an open-source speech recognition model developed by OpenAI.

We will try to use it to add subtitles to a recent interview.

Here is the original interview:

Download audio and video from YouTube

Code
import youtube_dl as ydl

video_url = 'https://www.youtube.com/watch?v=RYXcR3YejwY'

# Audio-only download; the FFmpegExtractAudio postprocessor converts it to
# a 192 kbps MP3.
ydl_audio_opts = {
    'outtmpl': 'whisper_messi_corto.%(ext)s',
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }],
}

# Video download. The subtitle step opens 'whisper_messi_corto.mp4' by name,
# so request MP4 streams and an MP4 container explicitly instead of relying
# on whatever container the default format selection happens to produce.
ydl_video_opts = {
    'outtmpl': 'whisper_messi_corto.%(ext)s',
    'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
    'merge_output_format': 'mp4',
}

# Run both downloads; the context manager lets each downloader clean up
# after itself once its download has finished.
for opts in (ydl_audio_opts, ydl_video_opts):
    with ydl.YoutubeDL(opts) as downloader:
        downloader.download([video_url])

:::

Generate the transcript

Code
import whisperx

# Input audio and the device both models are loaded on.
audio_file = "whisper_messi_corto.mp3"
device = "cuda"

# First pass: transcription with the large-v2 Whisper checkpoint.
model = whisperx.load_model("large-v2", device)
result = model.transcribe(audio_file)

# Segments as produced by Whisper, i.e. before alignment.
segments = result["segments"]
print(segments)

# Second pass: load the alignment model (and its metadata) for the language
# reported in the transcription result.
language_code = result["language"]
model_a, metadata = whisperx.load_align_model(language_code=language_code, device=device)

# Align the Whisper segments against the audio.
result_aligned = whisperx.align(segments, model_a, metadata, audio_file, device)

Embed the transcript in the video as subtitles

Code
import pandas as pd
import cv2
from moviepy.editor import VideoFileClip
import moviepy.editor as mp
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip

# Collect the aligned segments column-wise. Timestamps are kept as floats:
# truncating them to whole seconds would discard the sub-second timing from
# the alignment step and give zero-length cues for short segments.
segments = result_aligned['segments']
dict1 = {
    'start': [float(seg['start']) for seg in segments],
    'end': [float(seg['end']) for seg in segments],
    'text': [seg['text'] for seg in segments],
}

df = pd.DataFrame.from_dict(dict1)
df.to_csv('whisper_messi_corto_subs.csv')

# Read a single frame to learn the frame size, and always release the
# capture handle so the video file is not left open.
videocap = cv2.VideoCapture("whisper_messi_corto.mp4")
try:
    success, image = videocap.read()
finally:
    videocap.release()
if not success:
    raise RuntimeError("Could not read a frame from whisper_messi_corto.mp4")
height = image.shape[0]
width = image.shape[1]


def generator(txt):
    """Build the caption TextClip for one subtitle, as wide as the video and a quarter of its height."""
    return TextClip(
        txt,
        font='P052-Bold',
        fontsize=width/50,
        stroke_width=.7,
        color='white',
        stroke_color='black',
        size=(width, height*.25),
        method='caption',
    )


# SubtitlesClip takes a sequence of ((start, end), text) entries.
subs = [
    ((start, end), text)
    for start, end, text in zip(df['start'].values, df['end'].values, df['text'].values)
]
subtitles = SubtitlesClip(subs, generator)

# Overlay the subtitles at the bottom centre of the video and encode the result.
video = VideoFileClip('whisper_messi_corto.mp4')
final = CompositeVideoClip([video, subtitles.set_pos(('center','bottom'))])
final.write_videofile('whisper_messi_corto_con_subs.mp4', fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac")

Here is the final product, the video with the subtitles: