Commit · 1dfc17d
Parent(s): 60adf6c

small fixes

Files changed:
- app.py +16 -6
- examples.py +42 -0
- model.py +146 -10
app.py CHANGED

@@ -19,6 +19,7 @@
 # References:
 # https://gradio.app/docs/#dropdown
 
+import base64
 import logging
 import os
 import time
@@ -29,7 +30,7 @@ import torch
 import torchaudio
 
 from examples import examples
-from model import get_pretrained_model, language_to_models, sample_rate
+from model import decode, get_pretrained_model, language_to_models, sample_rate
 
 languages = list(language_to_models.keys())
 
@@ -39,6 +40,15 @@ def convert_to_wav(in_filename: str) -> str:
     out_filename = in_filename + ".wav"
     logging.info(f"Converting '{in_filename}' to '{out_filename}'")
     _ = os.system(f"ffmpeg -hide_banner -i '{in_filename}' -ar 16000 '{out_filename}'")
+    _ = os.system(
+        f"ffmpeg -hide_banner -loglevel error -i '{in_filename}' -ar 16000 '{out_filename}.flac'"
+    )
+
+    with open(out_filename + ".flac", "rb") as f:
+        s = "\n" + out_filename + "\n"
+        s += base64.b64encode(f.read()).decode()
+        logging.info(s)
+
     return out_filename
 
 
@@ -136,12 +146,8 @@ def process(
         decoding_method=decoding_method,
         num_active_paths=num_active_paths,
     )
-    s = recognizer.create_stream()
 
-
-    recognizer.decode_stream(s)
-
-    text = s.result.text
+    text = decode(recognizer, filename)
 
     date_time = now.strftime("%Y-%m-%d %H:%M:%S.%f")
     end = time.time()
@@ -173,6 +179,10 @@ title = "# Automatic Speech Recognition with Next-gen Kaldi"
 description = """
 This space shows how to do automatic speech recognition with Next-gen Kaldi.
 
+Please visit
+<https://huggingface.co/spaces/k2-fsa/streaming-automatic-speech-recognition>
+for streaming speech recognition with **Next-gen Kaldi**.
+
 It is running on CPU within a docker container provided by Hugging Face.
 
 See more information by visiting the following links:
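Note (not part of the commit): convert_to_wav() now also writes a base64-encoded FLAC copy of the uploaded audio into the Space logs. A minimal sketch of recovering such a payload, assuming the base64 string copied from the log has been saved to a hypothetical payload.txt:

import base64

# Read the base64 text copied from the Space log (hypothetical file name).
with open("payload.txt") as f:
    data = base64.b64decode(f.read().strip())

# Write the bytes back out as a playable FLAC file.
with open("recovered.flac", "wb") as f:
    f.write(data)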
examples.py CHANGED

@@ -58,6 +58,48 @@ examples = [
         4,
         "./test_wavs/tibetan/a_0_cacm-A70_31117.wav",
     ],
+    [
+        "Chinese",
+        "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+        "greedy_search",
+        4,
+        "./test_wavs/alimeeting/R8003_M8001-8004-165.wav",
+    ],
+    [
+        "Chinese",
+        "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+        "greedy_search",
+        4,
+        "./test_wavs/alimeeting/R8008_M8013-8049-74.wav",
+    ],
+    [
+        "Chinese",
+        "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
+        "greedy_search",
+        4,
+        "./test_wavs/alimeeting/R8009_M8020_N_SPK8026-8026-209.wav",
+    ],
+    [
+        "English",
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "greedy_search",
+        4,
+        "./test_wavs/tedlium3/DanBarber_2010-219.wav",
+    ],
+    [
+        "English",
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "greedy_search",
+        4,
+        "./test_wavs/tedlium3/DanielKahneman_2010-157.wav",
+    ],
+    [
+        "English",
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "greedy_search",
+        4,
+        "./test_wavs/tedlium3/RobertGupta_2010U-15.wav",
+    ],
     # librispeech
     # https://huggingface.co/csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless5-2022-05-13/tree/main/test_wavs
     [
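Note (not part of the commit): each example entry is a 5-field list whose order presumably has to match the Gradio input components in app.py: language, model repo_id, decoding method, number of active paths, and the path to a test wave file. A sketch of adding one more entry, reusing a repo_id and wave file from the list above with a different decoding method purely for illustration:

# Hypothetical extra entry; field order mirrors the entries added in this commit.
examples.append(
    [
        "English",                                           # language dropdown value
        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",  # model repo_id
        "modified_beam_search",                              # decoding method
        4,                                                   # num_active_paths
        "./test_wavs/tedlium3/DanBarber_2010-219.wav",       # sample audio file
    ]
)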
model.py CHANGED

@@ -14,9 +14,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from huggingface_hub import hf_hub_download
-from functools import lru_cache
 import os
+from functools import lru_cache
+from typing import Union
+
+import torch
+import torchaudio
+from huggingface_hub import hf_hub_download
 
 os.system(
     "cp -v /home/user/.local/lib/python3.8/site-packages/k2/lib/*.so /home/user/.local/lib/python3.8/site-packages/sherpa/lib/"
@@ -25,10 +29,59 @@ os.system(
 import k2
 import sherpa
 
-
 sample_rate = 16000
 
 
+def decode_offline_recognizer(
+    recognizer: Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer],
+    filename: str,
+) -> str:
+    s = recognizer.create_stream()
+
+    s.accept_wave_file(filename)
+    recognizer.decode_stream(s)
+
+    text = s.result.text.strip()
+    return text.lower()
+
+
+def decode_online_recognizer(
+    recognizer: Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer],
+    filename: str,
+) -> str:
+    samples, actual_sample_rate = torchaudio.load(filename)
+    assert sample_rate == actual_sample_rate, (
+        sample_rate,
+        actual_sample_rate,
+    )
+    samples = samples[0].contiguous()
+
+    s = recognizer.create_stream()
+
+    tail_padding = torch.zeros(int(sample_rate * 0.3), dtype=torch.float32)
+    s.accept_waveform(sample_rate, samples)
+    s.accept_waveform(sample_rate, tail_padding)
+    s.input_finished()
+
+    while recognizer.is_ready(s):
+        recognizer.decode_stream(s)
+
+    text = recognizer.get_result(s).text
+    return text.strip().lower()
+
+
+def decode(
+    recognizer: Union[sherpa.OfflineRecognizer, sherpa.OnlineRecognizer],
+    filename: str,
+) -> str:
+    if isinstance(recognizer, sherpa.OfflineRecognizer):
+        return decode_offline_recognizer(recognizer, filename)
+    elif isinstance(recognizer, sherpa.OnlineRecognizer):
+        return decode_online_recognizer(recognizer, filename)
+    else:
+        raise ValueError(f"Unknown recongizer type {type(recognizer)}")
+
+
 @lru_cache(maxsize=30)
 def get_pretrained_model(
     repo_id: str,
@@ -59,6 +112,10 @@ def get_pretrained_model(
         return german_models[repo_id](
             repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
         )
+    elif repo_id in japanese_models:
+        return japanese_models[repo_id](
+            repo_id, decoding_method=decoding_method, num_active_paths=num_active_paths
+        )
     else:
         raise ValueError(f"Unsupported repo_id: {repo_id}")
 
@@ -176,7 +233,7 @@ def _get_gigaspeech_pre_trained_model(
 
 
 @lru_cache(maxsize=10)
-def _get_librispeech_pre_trained_model(
+def _get_english_model(
     repo_id: str,
     decoding_method: str,
     num_active_paths: int,
@@ -186,6 +243,9 @@ def _get_librispeech_pre_trained_model(
         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13",  # noqa
         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11",  # noqa
         "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",  # noqa
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "pkufool/icefall_asr_librispeech_conformer_ctc",
+        "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21",
     ], repo_id
 
     filename = "cpu_jit.pt"
@@ -205,7 +265,15 @@ def _get_librispeech_pre_trained_model(
         repo_id=repo_id,
        filename=filename,
    )
-    tokens = _get_token_filename(repo_id=repo_id, subfolder="data/lang_bpe_500")
+    subfolder = "data/lang_bpe_500"
+
+    if repo_id in (
+        "videodanchik/icefall-asr-tedlium3-conformer-ctc2",
+        "pkufool/icefall_asr_librispeech_conformer_ctc",
+    ):
+        subfolder = "data/lang_bpe"
+
+    tokens = _get_token_filename(repo_id=repo_id, subfolder=subfolder)
 
     feat_config = sherpa.FeatureConfig()
     feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
@@ -311,12 +379,18 @@ def _get_alimeeting_pre_trained_model(
     num_active_paths: int,
 ):
     assert repo_id in [
+        "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
         "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2",
     ], repo_id
 
+    if repo_id == "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7":
+        filename = "cpu_jit.pt"
+    elif repo_id == "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2":
+        filename = "cpu_jit_torch_1.7.1.pt"
+
     nn_model = _get_nn_model_filename(
         repo_id=repo_id,
-        filename="cpu_jit_torch_1.7.1.pt",
+        filename=filename,
     )
     tokens = _get_token_filename(repo_id=repo_id)
 
@@ -530,21 +604,76 @@ def _get_german_pre_trained_model(
     return recognizer
 
 
+@lru_cache(maxsize=10)
+def _get_japanese_pre_trained_model(
+    repo_id: str,
+    decoding_method: str,
+    num_active_paths: int,
+):
+    repo_id, kind = repo_id.rsplit("-", maxsplit=1)
+
+    assert repo_id in [
+        "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208"
+    ], repo_id
+    assert kind in ("fluent", "disfluent"), kind
+
+    encoder_model = _get_nn_model_filename(
+        repo_id=repo_id, filename="encoder_jit_trace.pt", subfolder=f"exp_{kind}"
+    )
+
+    decoder_model = _get_nn_model_filename(
+        repo_id=repo_id, filename="decoder_jit_trace.pt", subfolder=f"exp_{kind}"
+    )
+
+    joiner_model = _get_nn_model_filename(
+        repo_id=repo_id, filename="joiner_jit_trace.pt", subfolder=f"exp_{kind}"
+    )
+
+    tokens = _get_token_filename(repo_id=repo_id)
+
+    feat_config = sherpa.FeatureConfig()
+    feat_config.fbank_opts.frame_opts.samp_freq = sample_rate
+    feat_config.fbank_opts.mel_opts.num_bins = 80
+    feat_config.fbank_opts.frame_opts.dither = 0
+
+    config = sherpa.OnlineRecognizerConfig(
+        nn_model="",
+        encoder_model=encoder_model,
+        decoder_model=decoder_model,
+        joiner_model=joiner_model,
+        tokens=tokens,
+        use_gpu=False,
+        feat_config=feat_config,
+        decoding_method=decoding_method,
+        num_active_paths=num_active_paths,
+        chunk_size=32,
+    )
+
+    recognizer = sherpa.OnlineRecognizer(config)
+
+    return recognizer
+
+
 chinese_models = {
     "luomingshuang/icefall_asr_wenetspeech_pruned_transducer_stateless2": _get_wenetspeech_pre_trained_model,  # noqa
+    "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7": _get_alimeeting_pre_trained_model,
     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-A-2022-07-12": _get_aishell2_pretrained_model,  # noqa
     "yuekai/icefall-asr-aishell2-pruned-transducer-stateless5-B-2022-07-12": _get_aishell2_pretrained_model,  # noqa
     "luomingshuang/icefall_asr_aidatatang-200zh_pruned_transducer_stateless2": _get_aidatatang_200zh_pretrained_mode,  # noqa
     "luomingshuang/icefall_asr_alimeeting_pruned_transducer_stateless2": _get_alimeeting_pre_trained_model,  # noqa
     "csukuangfj/wenet-chinese-model": _get_wenet_model,
+    # "csukuangfj/icefall-asr-wenetspeech-lstm-transducer-stateless-2022-10-14": _get_lstm_transducer_model,
 }
 
 english_models = {
     "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2": _get_gigaspeech_pre_trained_model,  # noqa
-    "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_librispeech_pre_trained_model,  # noqa
-    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_librispeech_pre_trained_model,  # noqa
-    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_librispeech_pre_trained_model,  # noqa
-    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_librispeech_pre_trained_model,  # noqa
+    "WeijiZhuang/icefall-asr-librispeech-pruned-transducer-stateless8-2022-12-02": _get_english_model,  # noqa
+    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14": _get_english_model,  # noqa
+    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless7-2022-11-11": _get_english_model,  # noqa
+    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless3-2022-05-13": _get_english_model,  # noqa
+    "videodanchik/icefall-asr-tedlium3-conformer-ctc2": _get_english_model,
+    "pkufool/icefall_asr_librispeech_conformer_ctc": _get_english_model,
+    "WayneWiser/icefall-asr-librispeech-conformer-ctc2-jit-bpe-500-2022-07-21": _get_english_model,
     "csukuangfj/wenet-english-model": _get_wenet_model,
 }
 
@@ -566,10 +695,16 @@ german_models = {
     "csukuangfj/wav2vec2.0-torchaudio": _get_german_pre_trained_model,
 }
 
+japanese_models = {
+    "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-fluent": _get_japanese_pre_trained_model,
+    "TeoWenShen/icefall-asr-csj-pruned-transducer-stateless7-streaming-230208-disfluent": _get_japanese_pre_trained_model,
+}
+
 all_models = {
     **chinese_models,
     **english_models,
     **chinese_english_mixed_models,
+    # **japanese_models,
     **tibetan_models,
     **arabic_models,
     **german_models,
@@ -579,6 +714,7 @@ language_to_models = {
     "Chinese": list(chinese_models.keys()),
     "English": list(english_models.keys()),
     "Chinese+English": list(chinese_english_mixed_models.keys()),
+    # "Japanese": list(japanese_models.keys()),
     "Tibetan": list(tibetan_models.keys()),
     "Arabic": list(arabic_models.keys()),
     "German": list(german_models.keys()),
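Note (not part of the commit): the new decode() helper inspects the recognizer type, so app.py no longer needs to know whether a model is offline (whole-file decoding via accept_wave_file) or online/streaming (chunked decoding with tail padding until the stream is drained). A minimal usage sketch, under the assumption that the wave file is already 16 kHz as produced by convert_to_wav():

from model import decode, get_pretrained_model

# One of the repo_ids registered in english_models above.
recognizer = get_pretrained_model(
    "csukuangfj/icefall-asr-librispeech-pruned-transducer-stateless8-2022-11-14",
    decoding_method="greedy_search",
    num_active_paths=4,
)

# decode() dispatches to decode_offline_recognizer() or
# decode_online_recognizer() based on isinstance checks.
text = decode(recognizer, "test.wav")
print(text)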