jhj0517 committed · 824b9ef
1 Parent(s): 20c2916

migrate faster-whisper to 1.0.3
Files changed:
- modules/vad/silero_vad.py (+13 -15)

modules/vad/silero_vad.py CHANGED
@@ -1,4 +1,4 @@
-from faster_whisper.vad import VadOptions
+from faster_whisper.vad import VadOptions, get_vad_model
 import numpy as np
 from typing import BinaryIO, Union, List, Optional
 import warnings
@@ -9,6 +9,8 @@ import gradio as gr
 class SileroVAD:
     def __init__(self):
         self.sampling_rate = 16000
+        self.window_size_samples = 512
+        self.model = None
 
     def run(self,
             audio: Union[str, BinaryIO, np.ndarray],
@@ -54,8 +56,8 @@ class SileroVAD:
 
         return audio
 
-    @staticmethod
     def get_speech_timestamps(
+            self,
             audio: np.ndarray,
             vad_options: Optional[VadOptions] = None,
             progress: gr.Progress = gr.Progress(),
@@ -72,22 +74,16 @@ class SileroVAD:
         Returns:
             List of dicts containing begin and end samples of each speech chunk.
         """
-
-
+
+        if self.model is None:
+            self.update_model()
 
         threshold = vad_options.threshold
         min_speech_duration_ms = vad_options.min_speech_duration_ms
         max_speech_duration_s = vad_options.max_speech_duration_s
         min_silence_duration_ms = vad_options.min_silence_duration_ms
-        window_size_samples = vad_options.window_size_samples
+        window_size_samples = self.window_size_samples
         speech_pad_ms = vad_options.speech_pad_ms
-
-        if window_size_samples not in [512, 1024, 1536]:
-            warnings.warn(
-                "Unusual window_size_samples! Supported window_size_samples:\n"
-                " - [512, 1024, 1536] for 16000 sampling_rate"
-            )
-
         sampling_rate = 16000
         min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
         speech_pad_samples = sampling_rate * speech_pad_ms / 1000
@@ -101,8 +97,7 @@ class SileroVAD:
 
         audio_length_samples = len(audio)
 
-
-        state = model.get_initial_state(batch_size=1)
+        state, context = self.model.get_initial_states(batch_size=1)
 
         speech_probs = []
         for current_start_sample in range(0, audio_length_samples, window_size_samples):
@@ -111,7 +106,7 @@ class SileroVAD:
             chunk = audio[current_start_sample: current_start_sample + window_size_samples]
             if len(chunk) < window_size_samples:
                 chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
-            speech_prob, state = model(chunk, state, sampling_rate)
+            speech_prob, state, context = self.model(chunk, state, context, sampling_rate)
             speech_probs.append(speech_prob)
 
         triggered = False
@@ -207,6 +202,9 @@ class SileroVAD:
 
         return speeches
 
+    def update_model(self):
+        self.model = get_vad_model()
+
     @staticmethod
     def collect_chunks(audio: np.ndarray, chunks: List[dict]) -> np.ndarray:
         """Collects and concatenates audio chunks."""
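
The heart of the migration is the stateful calling convention of faster-whisper 1.0.3's bundled Silero model: get_initial_states() now returns a (state, context) pair instead of a single state, and every per-window call takes the context in and returns it updated. A minimal sketch of that loop, distilled from the diff above (the silent input buffer is a stand-in, not part of the commit):

import numpy as np
from faster_whisper.vad import get_vad_model

model = get_vad_model()  # same loader the class now calls in update_model()
state, context = model.get_initial_states(batch_size=1)

audio = np.zeros(16000 * 5, dtype=np.float32)  # stand-in: 5 s of silence at 16 kHz
window = 512  # matches self.window_size_samples in the diff

speech_probs = []
for start in range(0, len(audio), window):
    chunk = audio[start:start + window]
    if len(chunk) < window:
        chunk = np.pad(chunk, (0, window - len(chunk)))
    # 1.0.3 threads the context through every call alongside the model state
    speech_prob, state, context = model(chunk, state, context, 16000)
    speech_probs.append(speech_prob)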
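
And a hypothetical usage sketch of the migrated class: the import path mirrors the file location in this commit, the VadOptions defaults are faster-whisper 1.0.3's, and the progress argument is left to its Gradio default (progress updates only render inside a Gradio event):

import numpy as np
from faster_whisper.vad import VadOptions
from modules.vad.silero_vad import SileroVAD  # path per the file list above

vad = SileroVAD()  # the VAD model itself is loaded lazily on first use
audio = np.random.default_rng(0).standard_normal(16000 * 3).astype(np.float32)

# get_speech_timestamps is now an instance method, no longer a @staticmethod;
# an explicit VadOptions is passed because the method dereferences it directly
speech_chunks = vad.get_speech_timestamps(audio, vad_options=VadOptions())
voiced_audio = SileroVAD.collect_chunks(audio, speech_chunks)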