Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 commited on Apr 8, 2024

Commit

b17cfc2

1 Parent(s): 5c9c269

refactoring to use data class

Browse files

Files changed (1) hide show

modules/whisper_Inference.py +142 -202

modules/whisper_Inference.py CHANGED Viewed

@@ -10,6 +10,7 @@ import torch
 from .base_interface import BaseInterface
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.youtube_manager import get_ytdata, get_ytaudio
 DEFAULT_MODEL_SIZE = "large-v3"
@@ -21,82 +22,54 @@ class WhisperInference(BaseInterface):
         self.model = None
         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.available_compute_types = ["float16", "float32"]
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
         self.default_beam_size = 1
     def transcribe_file(self,
-                        fileobjs: list,
-                        model_size: str,
-                        lang: str,
                         file_format: str,
-                        istranslate: bool,
                         add_timestamp: bool,
-                        beam_size: int,
-                        log_prob_threshold: float,
-                        no_speech_threshold: float,
-                        compute_type: str,
-                        progress=gr.Progress()) -> list:
         """
         Write subtitle file from Files
         Parameters
         ----------
-        fileobjs: list
             List of files to transcribe from gr.Files()
-        model_size: str
-            Whisper model size from gr.Dropdown()
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
         file_format: str
-            File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
-            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
-        compute_type: str
-            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
-            I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         Returns
         ----------
-        A List of
-        String to return to gr.Textbox()
-        Files to return to gr.Files()
         """
         try:
-            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             files_info = {}
-            for fileobj in fileobjs:
                 progress(0, desc="Loading Audio..")
-                audio = whisper.load_audio(fileobj.name)
-                result, elapsed_time = self.transcribe(audio=audio,
-                                                       lang=lang,
-                                                       istranslate=istranslate,
-                                                       beam_size=beam_size,
-                                                       log_prob_threshold=log_prob_threshold,
-                                                       no_speech_threshold=no_speech_threshold,
-                                                       compute_type=compute_type,
-                                                       progress=progress
-                                                       )
                 progress(1, desc="Completed!")
-                file_name, file_ext = os.path.splitext(os.path.basename(fileobj.name))
                 file_name = safe_filename(file_name)
                 subtitle, file_path = self.generate_and_write_file(
                     file_name=file_name,
@@ -104,7 +77,7 @@ class WhisperInference(BaseInterface):
                     add_timestamp=add_timestamp,
                     file_format=file_format
                 )
-                files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path":  file_path}
             total_result = ''
             total_time = 0
@@ -114,100 +87,71 @@ class WhisperInference(BaseInterface):
                 total_result += f"{info['subtitle']}"
                 total_time += info["elapsed_time"]
-            gr_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
-            gr_file_path = [info['path'] for info in files_info.values()]
-            return [gr_str, gr_file_path]
         except Exception as e:
             print(f"Error transcribing file: {str(e)}")
         finally:
             self.release_cuda_memory()
-            self.remove_input_files([fileobj.name for fileobj in fileobjs])
     def transcribe_youtube(self,
-                           youtubelink: str,
-                           model_size: str,
-                           lang: str,
                            file_format: str,
-                           istranslate: bool,
                            add_timestamp: bool,
-                           beam_size: int,
-                           log_prob_threshold: float,
-                           no_speech_threshold: float,
-                           compute_type: str,
-                           progress=gr.Progress()) -> list:
         """
         Write subtitle file from Youtube
         Parameters
         ----------
-        youtubelink: str
-            Link of Youtube to transcribe from gr.Textbox()
-        model_size: str
-            Whisper model size from gr.Dropdown()
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
         file_format: str
-            File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
-        compute_type: str
-            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
-            I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         Returns
         ----------
-        A List of
-        String to return to gr.Textbox()
-        Files to return to gr.Files()
         """
         try:
-            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             progress(0, desc="Loading Audio from Youtube..")
-            yt = get_ytdata(youtubelink)
             audio = whisper.load_audio(get_ytaudio(yt))
-            result, elapsed_time = self.transcribe(audio=audio,
-                                                   lang=lang,
-                                                   istranslate=istranslate,
-                                                   beam_size=beam_size,
-                                                   log_prob_threshold=log_prob_threshold,
-                                                   no_speech_threshold=no_speech_threshold,
-                                                   compute_type=compute_type,
-                                                   progress=progress)
             progress(1, desc="Completed!")
             file_name = safe_filename(yt.title)
-            subtitle, file_path = self.generate_and_write_file(
                 file_name=file_name,
                 transcribed_segments=result,
                 add_timestamp=add_timestamp,
                 file_format=file_format
             )
-            gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
-            return [gr_str, file_path]
         except Exception as e:
             print(f"Error transcribing youtube video: {str(e)}")
         finally:
             try:
                 if 'yt' not in locals():
-                    yt = get_ytdata(youtubelink)
                     file_path = get_ytaudio(yt)
                 else:
                     file_path = get_ytaudio(yt)
@@ -218,116 +162,71 @@ class WhisperInference(BaseInterface):
                 pass
     def transcribe_mic(self,
-                       micaudio: str,
-                       model_size: str,
-                       lang: str,
                        file_format: str,
-                       istranslate: bool,
-                       beam_size: int,
-                       log_prob_threshold: float,
-                       no_speech_threshold: float,
-                       compute_type: str,
-                       progress=gr.Progress()) -> list:
         """
         Write subtitle file from microphone
         Parameters
         ----------
-        micaudio: str
             Audio file path from gr.Microphone()
-        model_size: str
-            Whisper model size from gr.Dropdown()
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
         file_format: str
-            Subtitle format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
-        compute_type: str
-            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
-            I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         Returns
         ----------
-        A List of
-        String to return to gr.Textbox()
-        Files to return to gr.Files()
         """
         try:
-            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
-            result, elapsed_time = self.transcribe(audio=micaudio,
-                                                   lang=lang,
-                                                   istranslate=istranslate,
-                                                   beam_size=beam_size,
-                                                   log_prob_threshold=log_prob_threshold,
-                                                   no_speech_threshold=no_speech_threshold,
-                                                   compute_type=compute_type,
-                                                   progress=progress)
             progress(1, desc="Completed!")
-            subtitle, file_path = self.generate_and_write_file(
                 file_name="Mic",
                 transcribed_segments=result,
                 add_timestamp=True,
                 file_format=file_format
             )
-            gr_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
-            return [gr_str, file_path]
         except Exception as e:
             print(f"Error transcribing mic: {str(e)}")
         finally:
             self.release_cuda_memory()
-            self.remove_input_files([micaudio])
     def transcribe(self,
                    audio: Union[str, np.ndarray, torch.Tensor],
-                   lang: str,
-                   istranslate: bool,
-                   beam_size: int,
-                   log_prob_threshold: float,
-                   no_speech_threshold: float,
-                   compute_type: str,
-                   progress: gr.Progress
                    ) -> Tuple[List[dict], float]:
         """
-        transcribe method for OpenAI's Whisper implementation.
         Parameters
         ----------
-        audio: Union[str, BinaryIO, torch.Tensor]
             Audio path or file binary or Audio numpy array
-        lang: str
-            Source language of the file to transcribe from gr.Dropdown()
-        istranslate: bool
-            Boolean value from gr.Checkbox() that determines whether to translate to English.
-            It's Whisper's feature to translate speech from another language directly into English end-to-end.
-        beam_size: int
-            Int value from gr.Number() that is used for decoding option.
-        log_prob_threshold: float
-            float value from gr.Number(). If the average log probability over sampled tokens is
-            below this value, treat as failed.
-        no_speech_threshold: float
-            float value from gr.Number(). If the no_speech probability is higher than this value AND
-            the average log probability over sampled tokens is below `log_prob_threshold`,
-            consider the segment as silent.
-        compute_type: str
-            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         Returns
         ----------
@@ -337,45 +236,56 @@ class WhisperInference(BaseInterface):
             elapsed time for transcription
         """
         start_time = time.time()
         def progress_callback(progress_value):
             progress(progress_value, desc="Transcribing..")
-        if lang == "Automatic Detection":
-            lang = None
-        translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
         segments_result = self.model.transcribe(audio=audio,
-                                                language=lang,
                                                 verbose=False,
-                                                beam_size=beam_size,
-                                                logprob_threshold=log_prob_threshold,
-                                                no_speech_threshold=no_speech_threshold,
-                                                task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
-                                                fp16=True if compute_type == "float16" else False,
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
-    def update_model_if_needed(self,
-                               model_size: str,
-                               compute_type: str,
-                               progress: gr.Progress,
-                               ):
         """
-        Initialize model if it doesn't match with current model setting
         """
-        if compute_type != self.current_compute_type:
-            self.current_compute_type = compute_type
-        if model_size != self.current_model_size or self.model is None:
-            progress(0, desc="Initializing Model..")
-            self.current_model_size = model_size
-            self.model = whisper.load_model(
-                name=model_size,
-                device=self.device,
-                download_root=os.path.join("models", "Whisper")
-            )
     @staticmethod
     def generate_and_write_file(file_name: str,
@@ -384,7 +294,25 @@ class WhisperInference(BaseInterface):
                                 file_format: str,
                                 ) -> str:
         """
-        This method writes subtitle file and returns str to gr.Textbox
         """
         timestamp = datetime.now().strftime("%m%d%H%M%S")
         if add_timestamp:
@@ -410,6 +338,18 @@ class WhisperInference(BaseInterface):
     @staticmethod
     def format_time(elapsed_time: float) -> str:
         hours, rem = divmod(elapsed_time, 3600)
         minutes, seconds = divmod(rem, 60)

 from .base_interface import BaseInterface
 from modules.subtitle_manager import get_srt, get_vtt, get_txt, write_file, safe_filename
 from modules.youtube_manager import get_ytdata, get_ytaudio
+from modules.whisper_data_class import *
 DEFAULT_MODEL_SIZE = "large-v3"
         self.model = None
         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
+        self.translatable_model = ["large", "large-v1", "large-v2", "large-v3"]
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.available_compute_types = ["float16", "float32"]
         self.current_compute_type = "float16" if self.device == "cuda" else "float32"
         self.default_beam_size = 1
     def transcribe_file(self,
+                        files: list,
                         file_format: str,
                         add_timestamp: bool,
+                        progress=gr.Progress(),
+                        *whisper_params
+                        ) -> list:
         """
         Write subtitle file from Files
         Parameters
         ----------
+        files: list
             List of files to transcribe from gr.Files()
         file_format: str
+            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         add_timestamp: bool
+            Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the subtitle filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
+        result_str:
+            Result of transcription to return to gr.Textbox()
+        result_file_path:
+            Output file path to return to gr.Files()
         """
         try:
             files_info = {}
+            for file in files:
                 progress(0, desc="Loading Audio..")
+                audio = whisper.load_audio(file.name)
+                result, elapsed_time = self.transcribe(audio,
+                                                       progress,
+                                                       *whisper_params)
                 progress(1, desc="Completed!")
+                file_name, file_ext = os.path.splitext(os.path.basename(file.name))
                 file_name = safe_filename(file_name)
                 subtitle, file_path = self.generate_and_write_file(
                     file_name=file_name,
                     add_timestamp=add_timestamp,
                     file_format=file_format
                 )
+                files_info[file_name] = {"subtitle": subtitle, "elapsed_time": elapsed_time, "path": file_path}
             total_result = ''
             total_time = 0
                 total_result += f"{info['subtitle']}"
                 total_time += info["elapsed_time"]
+            result_str = f"Done in {self.format_time(total_time)}! Subtitle is in the outputs folder.\n\n{total_result}"
+            result_file_path = [info['path'] for info in files_info.values()]
+            return [result_str, result_file_path]
         except Exception as e:
             print(f"Error transcribing file: {str(e)}")
         finally:
             self.release_cuda_memory()
+            self.remove_input_files([file.name for file in files])
     def transcribe_youtube(self,
+                           youtube_link: str,
                            file_format: str,
                            add_timestamp: bool,
+                           progress=gr.Progress(),
+                           *whisper_params) -> list:
         """
         Write subtitle file from Youtube
         Parameters
         ----------
+        youtube_link: str
+            URL of the Youtube video to transcribe from gr.Textbox()
         file_format: str
+            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         add_timestamp: bool
             Boolean value from gr.Checkbox() that determines whether to add a timestamp at the end of the filename.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
+        result_str:
+            Result of transcription to return to gr.Textbox()
+        result_file_path:
+            Output file path to return to gr.Files()
         """
         try:
             progress(0, desc="Loading Audio from Youtube..")
+            yt = get_ytdata(youtube_link)
             audio = whisper.load_audio(get_ytaudio(yt))
+            result, elapsed_time = self.transcribe(audio,
+                                                   progress,
+                                                   *whisper_params)
             progress(1, desc="Completed!")
             file_name = safe_filename(yt.title)
+            subtitle, result_file_path = self.generate_and_write_file(
                 file_name=file_name,
                 transcribed_segments=result,
                 add_timestamp=add_timestamp,
                 file_format=file_format
             )
+            result_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
+            return [result_str, result_file_path]
         except Exception as e:
             print(f"Error transcribing youtube video: {str(e)}")
         finally:
             try:
                 if 'yt' not in locals():
+                    yt = get_ytdata(youtube_link)
                     file_path = get_ytaudio(yt)
                 else:
                     file_path = get_ytaudio(yt)
                 pass
     def transcribe_mic(self,
+                       mic_audio: str,
                        file_format: str,
+                       progress=gr.Progress(),
+                       *whisper_params) -> list:
         """
         Write subtitle file from microphone
         Parameters
         ----------
+        mic_audio: str
             Audio file path from gr.Microphone()
         file_format: str
+            Subtitle File format to write from gr.Dropdown(). Supported format: [SRT, WebVTT, txt]
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
+        result_str:
+            Result of transcription to return to gr.Textbox()
+        result_file_path:
+            Output file path to return to gr.Files()
         """
         try:
+            progress(0, desc="Loading Audio..")
+            result, elapsed_time = self.transcribe(
+                mic_audio,
+                progress,
+                *whisper_params,
+            )
             progress(1, desc="Completed!")
+            subtitle, result_file_path = self.generate_and_write_file(
                 file_name="Mic",
                 transcribed_segments=result,
                 add_timestamp=True,
                 file_format=file_format
             )
+            result_str = f"Done in {self.format_time(elapsed_time)}! Subtitle file is in the outputs folder.\n\n{subtitle}"
+            return [result_str, result_file_path]
         except Exception as e:
             print(f"Error transcribing mic: {str(e)}")
         finally:
             self.release_cuda_memory()
+            self.remove_input_files([mic_audio])
     def transcribe(self,
                    audio: Union[str, np.ndarray, torch.Tensor],
+                   progress: gr.Progress,
+                   *whisper_params,
                    ) -> Tuple[List[dict], float]:
         """
+        transcribe method for faster-whisper.
         Parameters
         ----------
+        audio: Union[str, BinaryIO, np.ndarray]
             Audio path or file binary or Audio numpy array
         progress: gr.Progress
             Indicator to show progress directly in gradio.
+        *whisper_params: tuple
+            Gradio components related to Whisper. see whisper_data_class.py for details.
         Returns
         ----------
             elapsed time for transcription
         """
         start_time = time.time()
+        params = WhisperGradioComponents.to_values(*whisper_params)
+        if params.model_size != self.current_model_size or self.model is None or self.current_compute_type != params.compute_type:
+            self.update_model(params.model_size, params.compute_type, progress)
+        if params.lang == "Automatic Detection":
+            params.lang = None
         def progress_callback(progress_value):
             progress(progress_value, desc="Transcribing..")
         segments_result = self.model.transcribe(audio=audio,
+                                                language=params.lang,
                                                 verbose=False,
+                                                beam_size=params.beam_size,
+                                                logprob_threshold=params.log_prob_threshold,
+                                                no_speech_threshold=params.no_speech_threshold,
+                                                task="translate" if params.is_translate and self.current_model_size in self.translatable_model else "transcribe",
+                                                fp16=True if params.compute_type == "float16" else False,
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
+    def update_model(self,
+                     model_size: str,
+                     compute_type: str,
+                     progress: gr.Progress,
+                     ):
         """
+        Update current model setting
+        Parameters
+        ----------
+        model_size: str
+            Size of whisper model
+        compute_type: str
+            Compute type for transcription.
+            see more info : https://opennmt.net/CTranslate2/quantization.html
+        progress: gr.Progress
+            Indicator to show progress directly in gradio.
         """
+        progress(0, desc="Initializing Model..")
+        self.current_compute_type = compute_type
+        self.current_model_size = model_size
+        self.model = whisper.load_model(
+            name=model_size,
+            device=self.device,
+            download_root=os.path.join("models", "Whisper")
+        )
     @staticmethod
     def generate_and_write_file(file_name: str,
                                 file_format: str,
                                 ) -> str:
         """
+        Writes subtitle file
+        Parameters
+        ----------
+        file_name: str
+            Output file name
+        transcribed_segments: list
+            Text segments transcribed from audio
+        add_timestamp: bool
+            Determines whether to add a timestamp to the end of the filename.
+        file_format: str
+            File format to write. Supported formats: [SRT, WebVTT, txt]
+        Returns
+        ----------
+        content: str
+            Result of the transcription
+        output_path: str
+            output file path
         """
         timestamp = datetime.now().strftime("%m%d%H%M%S")
         if add_timestamp:
     @staticmethod
     def format_time(elapsed_time: float) -> str:
+        """
+        Get {hours} {minutes} {seconds} time format string
+        Parameters
+        ----------
+        elapsed_time: str
+            Elapsed time for transcription
+        Returns
+        ----------
+        Time format string
+        """
         hours, rem = divmod(elapsed_time, 3600)
         minutes, seconds = divmod(rem, 60)