m1guelpf · Nisyhaal · Sep 5, 2024 · Sep 5, 2024 · Sep 6, 2024
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,7 @@ dist
 .DS_Store
 *.egg-info
 yt_whisper/__pycache__
+*.vtt
+.venv
+models
+build
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "yt_whisper"
+version = "1.0.0"
+description = "Generate subtitles for YouTube videos using Whisper"
+authors = [
+    { name = "Miguel Piedrafita", email = "your.email@example.com" }
+]
+dependencies = [
+    "yt-dlp",
+    "openai-whisper @ git+https://github.com/openai/whisper.git@main",
+    "openvino>=2024.1.0",
+    "nncf>=2.10.0",
+    "python-ffmpeg<=1.0.16",
+    "moviepy",
+    "transformers",
+    "onnx",
+    "optimum-intel @ git+https://github.com/huggingface/optimum-intel.git",
+    "peft==0.6.2",
+    "torch>=2.1,<2.4",
+    "torchvision<0.19.0",
+    "soundfile",
+    "librosa",
+    "jiwer",
+    "pytube @ git+https://github.com/garywu007/pytube.git"
+]
+
+[project.scripts]
+yt_whisper = "yt_whisper.cli:main"
diff --git a/setup.py b/setup.py
diff --git a/yt_whisper/cli.py b/yt_whisper/cli.py
@@ -4,8 +4,11 @@
 import argparse
 import warnings
 import yt_dlp
-from .utils import slugify, str2bool, write_srt, write_vtt
+from .utils import slugify, str2bool, write_srt, write_vtt, write_srt_openvino, write_vtt_openvino
 import tempfile
+from optimum.intel.openvino import OVModelForSpeechSeq2Seq
+from transformers import AutoProcessor, pipeline
+import subprocess
 
 
 def main():
@@ -25,40 +28,73 @@ def main():
                         "transcribe", "translate"], help="whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')")
     parser.add_argument("--language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
                         help="language spoken in the audio, skip to perform language detection")
-
+    parser.add_argument("--openvino", action='store_true',
+                        help="Whether to use openvino pipeline for inferencing.")
     parser.add_argument("--break-lines", type=int, default=0, 
                         help="Whether to break lines into a bottom-heavy pyramid shape if line length exceeds N characters. 0 disables line breaking.")
 
     args = parser.parse_args().__dict__
+    is_openvino = args.pop("openvino")
     model_name: str = args.pop("model")
     output_dir: str = args.pop("output_dir")
     subtitles_format: str = args.pop("format")
     os.makedirs(output_dir, exist_ok=True)
+    os.makedirs("models", exist_ok=True)
+
+    model_dir = os.path.join("models", model_name)
 
     if model_name.endswith(".en"):
         warnings.warn(
             f"{model_name} is an English-only model, forcing English detection.")
         args["language"] = "en"
 
-    model = whisper.load_model(model_name)
+    if is_openvino:
+        if not os.path.exists(model_dir):  # export once; later runs reuse the IR saved in model_dir
+            bash_command = ["optimum-cli", "export", "openvino", "-m", f"openai/whisper-{model_name}", model_dir, "--weight-format", "fp16"]
+            subprocess.run(bash_command, check=True)
+            print("Model downloaded and converted to OpenVINO Intermediate Representation (IR) successfully.")
+        ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device="cpu")
+        processor = AutoProcessor.from_pretrained(model_dir)
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model=ov_model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            generate_kwargs={"task": args.get("task", "transcribe")},  # honour --task instead of always transcribing
+            return_timestamps=True
+        )
+    else:
+        model = whisper.load_model(model_name)
+
     audios = get_audio(args.pop("video"))
     break_lines = args.pop("break_lines")
 
     for title, audio_path in audios.items():
         warnings.filterwarnings("ignore")
-        result = model.transcribe(audio_path, **args)
-        warnings.filterwarnings("default")
+        if is_openvino:
+            result = pipe(audio_path)
+            transcript = result["chunks"]
+        else:
+            result = model.transcribe(audio_path, **args)
+            transcript = result["segments"]
+        warnings.filterwarnings("default")  # restore warnings for both backends, not only whisper
 
         if (subtitles_format == 'vtt'):
             vtt_path = os.path.join(output_dir, f"{slugify(title)}.vtt")
             with open(vtt_path, 'w', encoding="utf-8") as vtt:
-                write_vtt(result["segments"], file=vtt, line_length=break_lines)
+                if is_openvino:
+                    write_vtt_openvino(transcript, file=vtt, line_length=break_lines)
+                else:
+                    write_vtt(transcript, file=vtt, line_length=break_lines)
 
             print("Saved VTT to", os.path.abspath(vtt_path))
         else:
             srt_path = os.path.join(output_dir, f"{slugify(title)}.srt")
             with open(srt_path, 'w', encoding="utf-8") as srt:
-                write_srt(result["segments"], file=srt, line_length=break_lines)
+                if is_openvino:
+                    write_srt_openvino(transcript, file=srt, line_length=break_lines)
+                else:
+                    write_srt(transcript, file=srt, line_length=break_lines)
 
             print("Saved SRT to", os.path.abspath(srt_path))
 

diff --git a/yt_whisper/utils.py b/yt_whisper/utils.py
@@ -1,4 +1,5 @@
 from typing import Iterator, TextIO
+import math
 
 
 def str2bool(string):
@@ -79,3 +80,53 @@ def write_srt(transcript: Iterator[dict], file: TextIO, line_length: int = 0):
 def slugify(title):
     return "".join(c if c.isalnum() else "_" for c in title).rstrip("_")
 
+
+def _openvino_bounds(segment: dict):
+    """Return the ``(start, end)`` times of a pipeline chunk, in seconds.
+
+    The transformers ASR pipeline may report ``None`` as the end of the last
+    chunk when Whisper predicts no closing timestamp (e.g. audio cut off
+    mid-word). Fall back to the start time so the value passed on to
+    ``format_timestamp`` is always a number.
+    """
+    start, end = segment['timestamp']
+    return start, (start if end is None else end)
+
+
+def write_vtt_openvino(transcript: Iterator[dict], file: TextIO, line_length: int = 0):
+    """Write pipeline chunks to ``file`` as WebVTT cues.
+
+    Each item of ``transcript`` needs a ``'text'`` string and a ``'timestamp'``
+    ``(start, end)`` pair; ``line_length`` is forwarded to ``process_segment``.
+    """
+    print("WEBVTT\n", file=file)
+    for segment in transcript:
+        segment = process_segment(segment, line_length=line_length)
+        start, end = _openvino_bounds(segment)
+
+        print(
+            f"{format_timestamp(start)} --> {format_timestamp(end)}\n"
+            f"{segment['text'].strip().replace('-->', '->')}\n",
+            file=file,
+            flush=True,
+        )
+
+
+def write_srt_openvino(transcript: Iterator[dict], file: TextIO, line_length: int = 0):
+    """Write pipeline chunks to ``file`` as numbered SRT cues (starting at 1).
+
+    Each item of ``transcript`` needs a ``'text'`` string and a ``'timestamp'``
+    ``(start, end)`` pair; ``line_length`` is forwarded to ``process_segment``.
+    """
+    for i, segment in enumerate(transcript, start=1):
+        segment = process_segment(segment, line_length=line_length)
+        start, end = _openvino_bounds(segment)
+
+        print(
+            f"{i}\n"
+            f"{format_timestamp(start, always_include_hours=True, decimal_marker=',')} --> "
+            f"{format_timestamp(end, always_include_hours=True, decimal_marker=',')}\n"
+            f"{segment['text'].strip().replace('-->', '->')}\n",
+            file=file,
+            flush=True,
+        )