Spaces:

jhj0517
/

Whisper-WebUI

Running

App Files Files Community

jhj0517 committed on Sep 25, 2023

Commit

d8dfcf0

1 Parent(s): 010b571

add compute_type dropdown

Browse files

Files changed (3) hide show

app.py +6 -3
modules/faster_whisper_inference.py +34 -21
modules/whisper_Inference.py +39 -15

app.py CHANGED Viewed

@@ -59,6 +59,7 @@ class App:
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
@@ -66,7 +67,7 @@ class App:
                         btn_openfolder = gr.Button('📂', scale=2)
                     params = [input_file, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
-                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + advanced_params,
                                   outputs=[tb_indicator])
@@ -97,6 +98,7 @@ class App:
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
@@ -104,7 +106,7 @@ class App:
                         btn_openfolder = gr.Button('📂', scale=2)
                     params = [tb_youtubelink, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
-                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + advanced_params,
                                   outputs=[tb_indicator])
@@ -128,6 +130,7 @@ class App:
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
@@ -135,7 +138,7 @@ class App:
                         btn_openfolder = gr.Button('📂', scale=2)
                     params = [mic_input, dd_model, dd_lang, dd_subformat, cb_translate]
-                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + advanced_params,
                                   outputs=[tb_indicator])

                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
+                        dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         btn_openfolder = gr.Button('📂', scale=2)
                     params = [input_file, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
+                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
                     btn_run.click(fn=self.whisper_inf.transcribe_file,
                                   inputs=params + advanced_params,
                                   outputs=[tb_indicator])
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
+                        dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         btn_openfolder = gr.Button('📂', scale=2)
                     params = [tb_youtubelink, dd_model, dd_lang, dd_subformat, cb_translate, cb_timestamp]
+                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
                     btn_run.click(fn=self.whisper_inf.transcribe_youtube,
                                   inputs=params + advanced_params,
                                   outputs=[tb_indicator])
                         nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
                         nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
                         nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
+                        dd_compute_type = gr.Dropdown(label="Compute Type", choices=self.whisper_inf.available_compute_types, value=self.whisper_inf.current_compute_type, interactive=True)
                     with gr.Row():
                         btn_run = gr.Button("GENERATE SUBTITLE FILE", variant="primary")
                     with gr.Row():
                         btn_openfolder = gr.Button('📂', scale=2)
                     params = [mic_input, dd_model, dd_lang, dd_subformat, cb_translate]
+                    advanced_params = [nb_beam_size, nb_log_prob_threshold, nb_no_speech_threshold, dd_compute_type]
                     btn_run.click(fn=self.whisper_inf.transcribe_mic,
                                   inputs=params + advanced_params,
                                   outputs=[tb_indicator])

modules/faster_whisper_inference.py CHANGED Viewed

@@ -24,9 +24,10 @@ class FasterWhisperInference(BaseInterface):
         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.translatable_models = ["large", "large-v1", "large-v2"]
-        self.default_beam_size = 1
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.compute_type = "float16" if self.device == "cuda" else "float32"
     def transcribe_file(self,
                         fileobjs: list,
@@ -38,6 +39,7 @@ class FasterWhisperInference(BaseInterface):
                         beam_size: int,
                         log_prob_threshold: float,
                         no_speech_threshold: float,
                         progress=gr.Progress()
                         ) -> str:
         """
@@ -67,6 +69,9 @@ class FasterWhisperInference(BaseInterface):
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -75,8 +80,7 @@ class FasterWhisperInference(BaseInterface):
         String to return to gr.Textbox()
         """
         try:
-            if model_size != self.current_model_size or self.model is None:
-                self.initialize_model(model_size=model_size, progress=progress)
             if lang == "Automatic Detection":
                 lang = None
@@ -129,6 +133,7 @@ class FasterWhisperInference(BaseInterface):
                            beam_size: int,
                            log_prob_threshold: float,
                            no_speech_threshold: float,
                            progress=gr.Progress()
                            ) -> str:
         """
@@ -158,6 +163,9 @@ class FasterWhisperInference(BaseInterface):
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -166,8 +174,7 @@ class FasterWhisperInference(BaseInterface):
         String to return to gr.Textbox()
         """
         try:
-            if model_size != self.current_model_size or self.model is None:
-                self.initialize_model(model_size=model_size, progress=progress)
             if lang == "Automatic Detection":
                 lang = None
@@ -220,6 +227,7 @@ class FasterWhisperInference(BaseInterface):
                        beam_size: int,
                        log_prob_threshold: float,
                        no_speech_threshold: float,
                        progress=gr.Progress()
                        ) -> str:
         """
@@ -246,6 +254,9 @@ class FasterWhisperInference(BaseInterface):
         no_speech_threshold: float
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -255,8 +266,7 @@ class FasterWhisperInference(BaseInterface):
         String to return to gr.Textbox()
         """
         try:
-            if model_size != self.current_model_size or self.model is None:
-                self.initialize_model(model_size=model_size, progress=progress)
             if lang == "Automatic Detection":
                 lang = None
@@ -353,21 +363,24 @@ class FasterWhisperInference(BaseInterface):
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
-    def initialize_model(self,
-                         model_size: str,
-                         progress: gr.Progress
-                         ):
         """
-        Initialize model if it doesn't match with current model size
         """
-        progress(0, desc="Initializing Model..")
-        self.current_model_size = model_size
-        self.model = faster_whisper.WhisperModel(
-            device=self.device,
-            model_size_or_path=model_size,
-            download_root=os.path.join("models", "Whisper", "faster-whisper"),
-            compute_type=self.compute_type
-        )
     @staticmethod
     def generate_and_write_subtitle(file_name: str,

         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.translatable_models = ["large", "large-v1", "large-v2"]
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.available_compute_types = ["int8", "int8_float32", "int8_float16", "int8_bfloat16", "int16", "float16", "bfloat16", "float32"]
+        self.current_compute_type = "float16" if self.device == "cuda" else "float32"
+        self.default_beam_size = 1
     def transcribe_file(self,
                         fileobjs: list,
                         beam_size: int,
                         log_prob_threshold: float,
                         no_speech_threshold: float,
+                        compute_type: str,
                         progress=gr.Progress()
                         ) -> str:
         """
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
+            see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         String to return to gr.Textbox()
         """
         try:
+            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             if lang == "Automatic Detection":
                 lang = None
                            beam_size: int,
                            log_prob_threshold: float,
                            no_speech_threshold: float,
+                           compute_type: str,
                            progress=gr.Progress()
                            ) -> str:
         """
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
+            see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         String to return to gr.Textbox()
         """
         try:
+            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             if lang == "Automatic Detection":
                 lang = None
                        beam_size: int,
                        log_prob_threshold: float,
                        no_speech_threshold: float,
+                       compute_type: str,
                        progress=gr.Progress()
                        ) -> str:
         """
         no_speech_threshold: float
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
+            see more info : https://opennmt.net/CTranslate2/quantization.html
         progress: gr.Progress
             Indicator to show progress directly in gradio.
         String to return to gr.Textbox()
         """
         try:
+            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             if lang == "Automatic Detection":
                 lang = None
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
+    def update_model_if_needed(self,
+                               model_size: str,
+                               compute_type: str,
+                               progress: gr.Progress
+                               ):
         """
+        (Re)load the faster-whisper model when the requested settings differ from the loaded ones.
+
+        The model is rebuilt when no model is loaded yet, or when `model_size` or `compute_type`
+        differs from the value currently stored on this instance. Otherwise nothing happens.
+        On reload, `self.current_model_size` and `self.current_compute_type` are updated.
+
+        model_size: str
+            Model size (or path) passed to faster_whisper.WhisperModel().
+        compute_type: str
+            compute type passed to faster_whisper.WhisperModel().
+            see more info : https://opennmt.net/CTranslate2/quantization.html
+        progress: gr.Progress
+            Indicator to show progress directly in gradio.
         """
+        if model_size != self.current_model_size or self.model is None or self.current_compute_type != compute_type:
+            progress(0, desc="Initializing Model..")
+            self.current_model_size = model_size
+            self.current_compute_type = compute_type
+            self.model = faster_whisper.WhisperModel(
+                device=self.device,
+                model_size_or_path=model_size,
+                download_root=os.path.join("models", "Whisper", "faster-whisper"),
+                compute_type=self.current_compute_type
+            )
     @staticmethod
     def generate_and_write_subtitle(file_name: str,

modules/whisper_Inference.py CHANGED Viewed

@@ -22,6 +22,8 @@ class WhisperInference(BaseInterface):
         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.default_beam_size = 1
     def transcribe_file(self,
@@ -34,6 +36,7 @@ class WhisperInference(BaseInterface):
                         beam_size: int,
                         log_prob_threshold: float,
                         no_speech_threshold: float,
                         progress=gr.Progress()):
         """
         Write subtitle file from Files
@@ -62,14 +65,15 @@ class WhisperInference(BaseInterface):
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         """
         try:
-            if model_size != self.current_model_size or self.model is None:
-                self.initialize_model(model_size=model_size, progress=progress)
             files_info = {}
             for fileobj in fileobjs:
@@ -82,7 +86,9 @@ class WhisperInference(BaseInterface):
                                                        beam_size=beam_size,
                                                        log_prob_threshold=log_prob_threshold,
                                                        no_speech_threshold=no_speech_threshold,
-                                                       progress=progress)
                 progress(1, desc="Completed!")
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj.orig_name))
@@ -122,6 +128,7 @@ class WhisperInference(BaseInterface):
                            beam_size: int,
                            log_prob_threshold: float,
                            no_speech_threshold: float,
                            progress=gr.Progress()):
         """
         Write subtitle file from Youtube
@@ -150,13 +157,14 @@ class WhisperInference(BaseInterface):
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         """
         try:
-            if model_size != self.current_model_size or self.model is None:
-                self.initialize_model(model_size=model_size, progress=progress)
             progress(0, desc="Loading Audio from Youtube..")
             yt = get_ytdata(youtubelink)
@@ -168,6 +176,7 @@ class WhisperInference(BaseInterface):
                                                    beam_size=beam_size,
                                                    log_prob_threshold=log_prob_threshold,
                                                    no_speech_threshold=no_speech_threshold,
                                                    progress=progress)
             progress(1, desc="Completed!")
@@ -205,6 +214,7 @@ class WhisperInference(BaseInterface):
                        beam_size: int,
                        log_prob_threshold: float,
                        no_speech_threshold: float,
                        progress=gr.Progress()):
         """
         Write subtitle file from microphone
@@ -231,14 +241,15 @@ class WhisperInference(BaseInterface):
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         """
         try:
-            if model_size != self.current_model_size or self.model is None:
-                self.initialize_model(model_size=model_size, progress=progress)
             result, elapsed_time = self.transcribe(audio=micaudio,
                                                    lang=lang,
@@ -246,6 +257,7 @@ class WhisperInference(BaseInterface):
                                                    beam_size=beam_size,
                                                    log_prob_threshold=log_prob_threshold,
                                                    no_speech_threshold=no_speech_threshold,
                                                    progress=progress)
             progress(1, desc="Completed!")
@@ -271,6 +283,7 @@ class WhisperInference(BaseInterface):
                    beam_size: int,
                    log_prob_threshold: float,
                    no_speech_threshold: float,
                    progress: gr.Progress
                    ) -> Tuple[list[dict], float]:
         """
@@ -294,6 +307,8 @@ class WhisperInference(BaseInterface):
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
         progress: gr.Progress
             Indicator to show progress directly in gradio.
@@ -320,21 +335,30 @@ class WhisperInference(BaseInterface):
                                                 logprob_threshold=log_prob_threshold,
                                                 no_speech_threshold=no_speech_threshold,
                                                 task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
-    def initialize_model(self,
-                         model_size: str,
-                         progress: gr.Progress
-                         ):
         """
-        Initialize model if it doesn't match with current model size
         """
-        progress(0, desc="Initializing Model..")
-        self.current_model_size = model_size
-        self.model = whisper.load_model(name=model_size, download_root=os.path.join("models", "Whisper"))
     @staticmethod
     def generate_and_write_subtitle(file_name: str,

         self.available_models = whisper.available_models()
         self.available_langs = sorted(list(whisper.tokenizer.LANGUAGES.values()))
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.available_compute_types = ["float16", "float32"]
+        self.current_compute_type = "float16" if self.device == "cuda" else "float32"
         self.default_beam_size = 1
     def transcribe_file(self,
                         beam_size: int,
                         log_prob_threshold: float,
                         no_speech_threshold: float,
+                        compute_type: str,
                         progress=gr.Progress()):
         """
         Write subtitle file from Files
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         """
         try:
+            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             files_info = {}
             for fileobj in fileobjs:
                                                        beam_size=beam_size,
                                                        log_prob_threshold=log_prob_threshold,
                                                        no_speech_threshold=no_speech_threshold,
+                                                       compute_type=compute_type,
+                                                       progress=progress
+                                                       )
                 progress(1, desc="Completed!")
                 file_name, file_ext = os.path.splitext(os.path.basename(fileobj.orig_name))
                            beam_size: int,
                            log_prob_threshold: float,
                            no_speech_threshold: float,
+                           compute_type: str,
                            progress=gr.Progress()):
         """
         Write subtitle file from Youtube
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         """
         try:
+            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             progress(0, desc="Loading Audio from Youtube..")
             yt = get_ytdata(youtubelink)
                                                    beam_size=beam_size,
                                                    log_prob_threshold=log_prob_threshold,
                                                    no_speech_threshold=no_speech_threshold,
+                                                   compute_type=compute_type,
                                                    progress=progress)
             progress(1, desc="Completed!")
                        beam_size: int,
                        log_prob_threshold: float,
                        no_speech_threshold: float,
+                       compute_type: str,
                        progress=gr.Progress()):
         """
         Write subtitle file from microphone
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
             I use a forked version of whisper for this. To see more info : https://github.com/jhj0517/jhj0517-whisper/tree/add-progress-callback
         """
         try:
+            self.update_model_if_needed(model_size=model_size, compute_type=compute_type, progress=progress)
             result, elapsed_time = self.transcribe(audio=micaudio,
                                                    lang=lang,
                                                    beam_size=beam_size,
                                                    log_prob_threshold=log_prob_threshold,
                                                    no_speech_threshold=no_speech_threshold,
+                                                   compute_type=compute_type,
                                                    progress=progress)
             progress(1, desc="Completed!")
                    beam_size: int,
                    log_prob_threshold: float,
                    no_speech_threshold: float,
+                   compute_type: str,
                    progress: gr.Progress
                    ) -> Tuple[list[dict], float]:
         """
             float value from gr.Number(). If the no_speech probability is higher than this value AND
             the average log probability over sampled tokens is below `log_prob_threshold`,
             consider the segment as silent.
+        compute_type: str
+            compute type from gr.Dropdown().
         progress: gr.Progress
             Indicator to show progress directly in gradio.
                                                 logprob_threshold=log_prob_threshold,
                                                 no_speech_threshold=no_speech_threshold,
                                                 task="translate" if istranslate and self.current_model_size in translatable_model else "transcribe",
+                                                fp16=True if compute_type == "float16" else False,
                                                 progress_callback=progress_callback)["segments"]
         elapsed_time = time.time() - start_time
         return segments_result, elapsed_time
+    def update_model_if_needed(self,
+                               model_size: str,
+                               compute_type: str,
+                               progress: gr.Progress,
+                               ):
         """
+        Record the requested compute type and (re)load the whisper model if needed.
+
+        `self.current_compute_type` is updated whenever `compute_type` differs from it.
+        The model itself is reloaded only when no model is loaded yet or when `model_size`
+        differs from `self.current_model_size`; a compute type change alone does not reload it.
+
+        model_size: str
+            Model name passed to whisper.load_model().
+        compute_type: str
+            compute type from gr.Dropdown().
+        progress: gr.Progress
+            Indicator to show progress directly in gradio.
         """
+        if compute_type != self.current_compute_type:
+            self.current_compute_type = compute_type
+        if model_size != self.current_model_size or self.model is None:
+            progress(0, desc="Initializing Model..")
+            self.current_model_size = model_size
+            self.model = whisper.load_model(
+                name=model_size,
+                device=self.device,
+                download_root=os.path.join("models", "Whisper")
+            )
     @staticmethod
     def generate_and_write_subtitle(file_name: str,