Spaces:
Running
Running
Merge pull request #153 from jhj0517/feature/add-params
Browse files
- app.py +9 -3
- modules/faster_whisper_inference.py +1 -0
- modules/whisper_parameter.py +10 -3
app.py
CHANGED
|
@@ -59,6 +59,7 @@ class App:
|
|
| 59 |
with gr.Row():
|
| 60 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
| 61 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
|
| 62 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 63 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 64 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
@@ -89,7 +90,8 @@ class App:
|
|
| 89 |
condition_on_previous_text=cb_condition_on_previous_text,
|
| 90 |
initial_prompt=tb_initial_prompt,
|
| 91 |
temperature=sd_temperature,
|
| 92 |
-
compression_ratio_threshold=nb_compression_ratio_threshold
|
|
|
|
| 93 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
| 94 |
inputs=params + whisper_params.to_list(),
|
| 95 |
outputs=[tb_indicator, files_subtitles])
|
|
@@ -117,6 +119,7 @@ class App:
|
|
| 117 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
| 118 |
interactive=True)
|
| 119 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
|
| 120 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 121 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 122 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
@@ -147,7 +150,8 @@ class App:
|
|
| 147 |
condition_on_previous_text=cb_condition_on_previous_text,
|
| 148 |
initial_prompt=tb_initial_prompt,
|
| 149 |
temperature=sd_temperature,
|
| 150 |
-
compression_ratio_threshold=nb_compression_ratio_threshold
|
|
|
|
| 151 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
| 152 |
inputs=params + whisper_params.to_list(),
|
| 153 |
outputs=[tb_indicator, files_subtitles])
|
|
@@ -168,6 +172,7 @@ class App:
|
|
| 168 |
with gr.Row():
|
| 169 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
| 170 |
with gr.Accordion("Advanced_Parameters", open=False):
|
|
|
|
| 171 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 172 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 173 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
@@ -197,7 +202,8 @@ class App:
|
|
| 197 |
condition_on_previous_text=cb_condition_on_previous_text,
|
| 198 |
initial_prompt=tb_initial_prompt,
|
| 199 |
temperature=sd_temperature,
|
| 200 |
-
compression_ratio_threshold=nb_compression_ratio_threshold
|
|
|
|
| 201 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
| 202 |
inputs=params + whisper_params.to_list(),
|
| 203 |
outputs=[tb_indicator, files_subtitles])
|
|
|
|
| 59 |
with gr.Row():
|
| 60 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename", interactive=True)
|
| 61 |
with gr.Accordion("Advanced_Parameters", open=False):
|
| 62 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
| 63 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 64 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 65 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
|
| 90 |
condition_on_previous_text=cb_condition_on_previous_text,
|
| 91 |
initial_prompt=tb_initial_prompt,
|
| 92 |
temperature=sd_temperature,
|
| 93 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 94 |
+
vad_filter=cb_vad_filter)
|
| 95 |
btn_run.click(fn=self.whisper_inf.transcribe_file,
|
| 96 |
inputs=params + whisper_params.to_list(),
|
| 97 |
outputs=[tb_indicator, files_subtitles])
|
|
|
|
| 119 |
cb_timestamp = gr.Checkbox(value=True, label="Add a timestamp to the end of the filename",
|
| 120 |
interactive=True)
|
| 121 |
with gr.Accordion("Advanced_Parameters", open=False):
|
| 122 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
| 123 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 124 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 125 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
|
| 150 |
condition_on_previous_text=cb_condition_on_previous_text,
|
| 151 |
initial_prompt=tb_initial_prompt,
|
| 152 |
temperature=sd_temperature,
|
| 153 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 154 |
+
vad_filter=cb_vad_filter)
|
| 155 |
btn_run.click(fn=self.whisper_inf.transcribe_youtube,
|
| 156 |
inputs=params + whisper_params.to_list(),
|
| 157 |
outputs=[tb_indicator, files_subtitles])
|
|
|
|
| 172 |
with gr.Row():
|
| 173 |
cb_translate = gr.Checkbox(value=False, label="Translate to English?", interactive=True)
|
| 174 |
with gr.Accordion("Advanced_Parameters", open=False):
|
| 175 |
+
cb_vad_filter = gr.Checkbox(label="Enable Silero VAD Filter", value=False, interactive=True)
|
| 176 |
nb_beam_size = gr.Number(label="Beam Size", value=1, precision=0, interactive=True)
|
| 177 |
nb_log_prob_threshold = gr.Number(label="Log Probability Threshold", value=-1.0, interactive=True)
|
| 178 |
nb_no_speech_threshold = gr.Number(label="No Speech Threshold", value=0.6, interactive=True)
|
|
|
|
| 202 |
condition_on_previous_text=cb_condition_on_previous_text,
|
| 203 |
initial_prompt=tb_initial_prompt,
|
| 204 |
temperature=sd_temperature,
|
| 205 |
+
compression_ratio_threshold=nb_compression_ratio_threshold,
|
| 206 |
+
vad_filter=cb_vad_filter)
|
| 207 |
btn_run.click(fn=self.whisper_inf.transcribe_mic,
|
| 208 |
inputs=params + whisper_params.to_list(),
|
| 209 |
outputs=[tb_indicator, files_subtitles])
|
modules/faster_whisper_inference.py
CHANGED
|
@@ -271,6 +271,7 @@ class FasterWhisperInference(BaseInterface):
|
|
| 271 |
patience=params.patience,
|
| 272 |
temperature=params.temperature,
|
| 273 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
|
|
|
| 274 |
)
|
| 275 |
progress(0, desc="Loading audio..")
|
| 276 |
|
|
|
|
| 271 |
patience=params.patience,
|
| 272 |
temperature=params.temperature,
|
| 273 |
compression_ratio_threshold=params.compression_ratio_threshold,
|
| 274 |
+
vad_filter=params.vad_filter,
|
| 275 |
)
|
| 276 |
progress(0, desc="Loading audio..")
|
| 277 |
|
modules/whisper_parameter.py
CHANGED
|
@@ -18,6 +18,7 @@ class WhisperGradioComponents:
|
|
| 18 |
initial_prompt: gr.Textbox
|
| 19 |
temperature: gr.Slider
|
| 20 |
compression_ratio_threshold: gr.Number
|
|
|
|
| 21 |
"""
|
| 22 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
| 23 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
@@ -66,12 +67,17 @@ class WhisperGradioComponents:
|
|
| 66 |
to make it more likely to predict those words correctly.
|
| 67 |
|
| 68 |
temperature: gr.Slider
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
|
| 73 |
compression_ratio_threshold: gr.Number
|
| 74 |
If the gzip compression ratio is above this value, treat as failed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
"""
|
| 76 |
|
| 77 |
def to_list(self) -> list:
|
|
@@ -101,6 +107,7 @@ class WhisperValues:
|
|
| 101 |
initial_prompt: Optional[str]
|
| 102 |
temperature: float
|
| 103 |
compression_ratio_threshold: float
|
|
|
|
| 104 |
"""
|
| 105 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
| 106 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
|
| 18 |
initial_prompt: gr.Textbox
|
| 19 |
temperature: gr.Slider
|
| 20 |
compression_ratio_threshold: gr.Number
|
| 21 |
+
vad_filter: gr.Checkbox
|
| 22 |
"""
|
| 23 |
A data class for Gradio components of the Whisper Parameters. Use "before" Gradio pre-processing.
|
| 24 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|
|
|
|
| 67 |
to make it more likely to predict those words correctly.
|
| 68 |
|
| 69 |
temperature: gr.Slider
|
| 70 |
+
Temperature for sampling. It can be a tuple of temperatures,
|
| 71 |
+
which will be successively used upon failures according to either
|
| 72 |
+
`compression_ratio_threshold` or `log_prob_threshold`.
|
| 73 |
|
| 74 |
compression_ratio_threshold: gr.Number
|
| 75 |
If the gzip compression ratio is above this value, treat as failed
|
| 76 |
+
|
| 77 |
+
vad_filter: gr.Checkbox
|
| 78 |
+
Enable the voice activity detection (VAD) to filter out parts of the audio
|
| 79 |
+
without speech. This step is using the Silero VAD model
|
| 80 |
+
https://github.com/snakers4/silero-vad.
|
| 81 |
"""
|
| 82 |
|
| 83 |
def to_list(self) -> list:
|
|
|
|
| 107 |
initial_prompt: Optional[str]
|
| 108 |
temperature: float
|
| 109 |
compression_ratio_threshold: float
|
| 110 |
+
vad_filter: bool
|
| 111 |
"""
|
| 112 |
A data class to use Whisper parameters. Use "after" Gradio pre-processing.
|
| 113 |
See more about Gradio pre-processing: https://www.gradio.app/docs/components
|