Whisper is an open source model for speech recognition developed by OpenAI.
We will try to use it to add subtitles to a recent interview.
Here is the original interview:
Download the audio and video from YouTube
Code
# Download the interview from YouTube twice: once as best-quality audio
# (re-encoded to MP3 by ffmpeg) and once as the full video file.
import youtube_dl as ydl

VIDEO_URL = 'https://www.youtube.com/watch?v=RYXcR3YejwY'

# Audio: pick the best audio stream and let ffmpeg extract a 192 kbps MP3.
audio_options = {
    'outtmpl': 'whisper_messi_corto.%(ext)s',
    'format': 'bestaudio/best',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192'}]
}

# Video: default format selection, same output basename as the audio.
video_options = {
    'outtmpl': 'whisper_messi_corto.%(ext)s',
}

# Audio first, then video — same order as before.
for options in (audio_options, video_options):
    ydl.YoutubeDL(options).download([VIDEO_URL])
Generate the transcript
Code
# Transcribe the downloaded MP3 with WhisperX: a plain Whisper pass first,
# then forced alignment to get accurate per-segment timestamps.
import whisperx

device = "cuda"
audio_file = "whisper_messi_corto.mp3"

# Plain Whisper transcription with the large-v2 checkpoint.
asr_model = whisperx.load_model("large-v2", device)
raw_result = asr_model.transcribe(audio_file)
print(raw_result["segments"]) # before alignment

# Load the alignment model matching the detected language, then align the
# Whisper segments against the audio for precise timings.
align_model, align_metadata = whisperx.load_align_model(language_code=raw_result["language"], device=device)
result_aligned = whisperx.align(raw_result["segments"], align_model, align_metadata, audio_file, device)
Embed the transcript in the video as subtitles
Code
# Build a (start, end, text) table from the aligned segments, save it as a
# CSV, and burn the subtitles into the video with moviepy.
import pandas as pd
import cv2
from moviepy.editor import VideoFileClip
import moviepy.editor as mp
from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip

# Keep the aligned timestamps as floats: truncating them to whole seconds
# (int(...)) would shift every subtitle by up to one second and throw away
# exactly the sub-second precision the alignment step produced.
segments = result_aligned['segments']
df = pd.DataFrame({
    'start': [float(seg['start']) for seg in segments],
    'end': [float(seg['end']) for seg in segments],
    'text': [seg['text'] for seg in segments],
})
# index=False: the row index carries no information, so keep it out of the CSV.
df.to_csv('whisper_messi_corto_subs.csv', index=False)

# Read a single frame only to learn the video dimensions, then release the
# capture immediately so the file handle is not leaked.
videocap = cv2.VideoCapture("whisper_messi_corto.mp4")
success, image = videocap.read()
videocap.release()
if not success:
    # A failed read leaves image as None; fail loudly instead of crashing
    # later on image.shape with an opaque NoneType error.
    raise RuntimeError("could not read a frame from whisper_messi_corto.mp4")
height, width = image.shape[:2]

# Render each subtitle line as a caption sized relative to the video frame.
generator = lambda txt: TextClip(txt, font='P052-Bold', fontsize=width/50, stroke_width=.7, color='white', stroke_color='black', size=(width, height*.25), method='caption')
# SubtitlesClip expects ((start, end), text) pairs.
subs = list(zip(zip(df['start'].values, df['end'].values), df['text'].values))
subtitles = SubtitlesClip(subs, generator)

# Composite the subtitles over the original clip and encode the result.
video = VideoFileClip('whisper_messi_corto.mp4')
final = CompositeVideoClip([video, subtitles.set_pos(('center', 'bottom'))])
final.write_videofile('whisper_messi_corto_con_subs.mp4', fps=video.fps, remove_temp=True, codec="libx264", audio_codec="aac")
# Release ffmpeg readers/writers held by the clips.
final.close()
video.close()
Here is the final product, the video with the subtitles: