diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..c2f3033fb212a52e8ef9feb867ab96cab146128c --- /dev/null +++ b/.gitignore @@ -0,0 +1,214 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +# Ignore +sub_tra.* +sub_ori.* +SPEAKER_00.* +SPEAKER_01.* +SPEAKER_02.* +SPEAKER_03.* +SPEAKER_04.* +SPEAKER_05.* +SPEAKER_06.* +SPEAKER_07.* +SPEAKER_08.* +SPEAKER_09.* +SPEAKER_10.* +SPEAKER_11.* +task_subtitle.* +*.mp3 +*.mp4 +*.ogg +*.wav +*.mkv +*.webm +*.avi +*.mpg +*.mov +*.ogv +*.wmv +test.py +list.txt +text_preprocessor.txt +text_translation.txt +*.srt +*.vtt +*.tsv +*.aud +*.ass +*.pt +.vscode/ +mdx_models/*.onnx +_XTTS_/ +downloads/ +logs/ +weights/ +clean_song_output/ +audio2/ +audio/ +outputs/ +processed/ +OPENVOICE_MODELS/ +PIPER_MODELS/ +WHISPER_MODELS/ +whisper_api_audio_parts/ +uroman/ +pdf_images/ \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/SoniTranslate_Colab.ipynb b/SoniTranslate_Colab.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..924016d60c1a26b791840a9d9e373d70697a77b2 --- /dev/null +++ b/SoniTranslate_Colab.ipynb @@ -0,0 +1,124 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# SoniTranslate\n", + "\n", + "| Description | Link |\n", + "| ----------- | ---- |\n", + "| 🎉 Repository | [![GitHub Repository](https://img.shields.io/badge/GitHub-Repository-black?style=flat-square&logo=github)](https://github.com/R3gm/SoniTranslate/) |\n", + "| 🚀 Online Demo in HF | [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/r3gm/SoniTranslate_translate_audio_of_a_video_content) |\n", + "\n", + "\n" + ], + "metadata": { + "id": "8lw0EgLex-YZ" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LUgwm0rfx0_J", + "cellView": "form" + }, + "outputs": [], + "source": [ + "# @title Install requirements for SoniTranslate\n", + "!git clone https://github.com/r3gm/SoniTranslate.git\n", + "%cd SoniTranslate\n", + "\n", + "!apt install git-lfs\n", + "!git lfs install\n", + "\n", + "!sed -i 's|git+https://github.com/R3gm/whisperX.git@cuda_11_8|git+https://github.com/R3gm/whisperX.git@cuda_12_x|' requirements_base.txt\n", + "!pip install -q -r requirements_base.txt\n", + "!pip install -q -r requirements_extra.txt\n", + "!pip install -q ort-nightly-gpu --index-url=https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ort-cuda-12-nightly/pypi/simple/\n", + "\n", + "Install_PIPER_TTS = True # @param {type:\"boolean\"}\n", + "\n", + "if Install_PIPER_TTS:\n", + " !pip install -q piper-tts==1.2.0\n", + "\n", + "Install_Coqui_XTTS = True # @param {type:\"boolean\"}\n", + "\n", + "if Install_Coqui_XTTS:\n", + " !pip install -q -r requirements_xtts.txt\n", + " !pip install -q TTS==0.21.1 --no-deps" + ] + }, + { + 
"cell_type": "markdown", + "source": [ + "One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and `accept the license to use the models`: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation\n", + "\n", + "\n", + "\n", + "\n", + "Get your KEY TOKEN here: https://hf.co/settings/tokens" + ], + "metadata": { + "id": "LTaTstXPXNg2" + } + }, + { + "cell_type": "code", + "source": [ + "#@markdown # `RUN THE WEB APP`\n", + "YOUR_HF_TOKEN = \"\" #@param {type:'string'}\n", + "%env YOUR_HF_TOKEN={YOUR_HF_TOKEN}\n", + "theme = \"Taithrah/Minimal\" # @param [\"Taithrah/Minimal\", \"aliabid94/new-theme\", \"gstaff/xkcd\", \"ParityError/LimeFace\", \"abidlabs/pakistan\", \"rottenlittlecreature/Moon_Goblin\", \"ysharma/llamas\", \"gradio/dracula_revamped\"]\n", + "interface_language = \"english\" # @param ['arabic', 'azerbaijani', 'chinese_zh_cn', 'english', 'french', 'german', 'hindi', 'indonesian', 'italian', 'japanese', 'korean', 'marathi', 'polish', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish', 'ukrainian', 'vietnamese']\n", + "verbosity_level = \"info\" # @param [\"debug\", \"info\", \"warning\", \"error\", \"critical\"]\n", + "\n", + "\n", + "%cd /content/SoniTranslate\n", + "!python app_rvc.py --theme {theme} --verbosity_level {verbosity_level} --language {interface_language} --public_url" + ], + "metadata": { + "id": "XkhXfaFw4R4J", + "cellView": "form" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Open the `public URL` when it appears" + ], + "metadata": { + "id": "KJW3KrhZJh0u" + } + } + ] +} \ No newline at end of file diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..928f700cb2ce23283e8c67f63c49cecd544a13a0 --- /dev/null +++ b/app.py @@ -0,0 +1,2 @@ +import os +os.system("python app_rvc.py --language french --theme aliabid94/new-theme") \ No newline at end of file diff --git a/app_rvc.py b/app_rvc.py new file mode 100644 index 0000000000000000000000000000000000000000..c04a4d12a262c4c2d4bc591bcd138a35e726bc86 --- /dev/null +++ b/app_rvc.py @@ -0,0 +1,2884 @@ +import gradio as gr +import os +os.system("pip install -q piper-tts==1.2.0") +os.system("pip install -q -r requirements_xtts.txt") +os.system("pip install -q TTS==0.21.1 --no-deps") +import spaces +from soni_translate.logging_setup import ( + logger, + set_logging_level, + configure_logging_libs, +); configure_logging_libs() # noqa +import whisperx +import torch +import os +from soni_translate.audio_segments import create_translated_audio +from soni_translate.text_to_speech import ( + audio_segmentation_to_voice, + edge_tts_voices_list, + coqui_xtts_voices_list, + piper_tts_voices_list, + create_wav_file_vc, + accelerate_segments, +) +from soni_translate.translate_segments import ( + translate_text, + TRANSLATION_PROCESS_OPTIONS, + DOCS_TRANSLATION_PROCESS_OPTIONS +) +from soni_translate.preprocessor import ( + audio_video_preprocessor, + audio_preprocessor, +) +from soni_translate.postprocessor import ( + OUTPUT_TYPE_OPTIONS, + DOCS_OUTPUT_TYPE_OPTIONS, + sound_separate, + get_no_ext_filename, + media_out, + get_subtitle_speaker, +) +from soni_translate.language_configuration import ( + LANGUAGES, + UNIDIRECTIONAL_L_LIST, + LANGUAGES_LIST, + BARK_VOICES_LIST, + VITS_VOICES_LIST, + OPENAI_TTS_MODELS, +) +from soni_translate.utils import ( + remove_files, + download_list, + upload_model_list, + download_manager, + 
run_command, + is_audio_file, + is_subtitle_file, + copy_files, + get_valid_files, + get_link_list, + remove_directory_contents, +) +from soni_translate.mdx_net import ( + UVR_MODELS, + MDX_DOWNLOAD_LINK, + mdxnet_models_dir, +) +from soni_translate.speech_segmentation import ( + ASR_MODEL_OPTIONS, + COMPUTE_TYPE_GPU, + COMPUTE_TYPE_CPU, + find_whisper_models, + transcribe_speech, + align_speech, + diarize_speech, + diarization_models, +) +from soni_translate.text_multiformat_processor import ( + BORDER_COLORS, + srt_file_to_segments, + document_preprocessor, + determine_chunk_size, + plain_text_to_segments, + segments_to_plain_text, + process_subtitles, + linguistic_level_segments, + break_aling_segments, + doc_to_txtximg_pages, + page_data_to_segments, + update_page_data, + fix_timestamps_docs, + create_video_from_images, + merge_video_and_audio, +) +from soni_translate.languages_gui import language_data, news +import copy +import logging +import json +from pydub import AudioSegment +from voice_main import ClassVoices +import argparse +import time +import hashlib +import sys + +directories = [ + "downloads", + "logs", + "weights", + "clean_song_output", + "_XTTS_", + f"audio2{os.sep}audio", + "audio", + "outputs", +] +[ + os.makedirs(directory) + for directory in directories + if not os.path.exists(directory) +] + + +class TTS_Info: + def __init__(self, piper_enabled, xtts_enabled): + self.list_edge = edge_tts_voices_list() + self.list_bark = list(BARK_VOICES_LIST.keys()) + self.list_vits = list(VITS_VOICES_LIST.keys()) + self.list_openai_tts = OPENAI_TTS_MODELS + self.piper_enabled = piper_enabled + self.list_vits_onnx = ( + piper_tts_voices_list() if self.piper_enabled else [] + ) + self.xtts_enabled = xtts_enabled + + def tts_list(self): + self.list_coqui_xtts = ( + coqui_xtts_voices_list() if self.xtts_enabled else [] + ) + list_tts = self.list_coqui_xtts + sorted( + self.list_edge + + (self.list_bark if os.environ.get("ZERO_GPU") != "TRUE" else []) + + self.list_vits + + self.list_openai_tts + + self.list_vits_onnx + ) + return list_tts + + +def prog_disp(msg, percent, is_gui, progress=None): + logger.info(msg) + if is_gui: + progress(percent, desc=msg) + + +def warn_disp(wrn_lang, is_gui): + logger.warning(wrn_lang) + if is_gui: + gr.Warning(wrn_lang) + + +class SoniTrCache: + def __init__(self): + self.cache = { + 'media': [[]], + 'refine_vocals': [], + 'transcript_align': [], + 'break_align': [], + 'diarize': [], + 'translate': [], + 'subs_and_edit': [], + 'tts': [], + 'acc_and_vc': [], + 'mix_aud': [], + 'output': [] + } + + self.cache_data = { + 'media': [], + 'refine_vocals': [], + 'transcript_align': [], + 'break_align': [], + 'diarize': [], + 'translate': [], + 'subs_and_edit': [], + 'tts': [], + 'acc_and_vc': [], + 'mix_aud': [], + 'output': [] + } + + self.cache_keys = list(self.cache.keys()) + self.first_task = self.cache_keys[0] + self.last_task = self.cache_keys[-1] + + self.pre_step = None + self.pre_params = [] + + def set_variable(self, variable_name, value): + setattr(self, variable_name, value) + + def task_in_cache(self, step: str, params: list, previous_step_data: dict): + + self.pre_step_cache = None + + if step == self.first_task: + self.pre_step = None + + if self.pre_step: + self.cache[self.pre_step] = self.pre_params + + # Fill data in cache + self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data) + + self.pre_params = params + # logger.debug(f"Step: {str(step)}, Cache params: {str(self.cache)}") + if params == self.cache[step]: + 
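+ # Cache hit: this step's parameters match the previous run, so restore its saved results from cache_data instead of recomputing them.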
logger.debug(f"In cache: {str(step)}") + + # Set the var needed for next step + # Recovery from cache_data the current step + for key, value in self.cache_data[step].items(): + self.set_variable(key, copy.deepcopy(value)) + logger.debug( + f"Chache load: {str(key)}" + ) + + self.pre_step = step + return True + + else: + logger.debug(f"Flush next and caching {str(step)}") + selected_index = self.cache_keys.index(step) + + for idx, key in enumerate(self.cache.keys()): + if idx >= selected_index: + self.cache[key] = [] + self.cache_data[key] = {} + + # The last is now previous + self.pre_step = step + return False + + def clear_cache(self, media, force=False): + + self.cache["media"] = ( + self.cache["media"] if len(self.cache["media"]) else [[]] + ) + + if media != self.cache["media"][0] or force: + + # Clear cache + self.cache = {key: [] for key in self.cache} + self.cache["media"] = [[]] + + logger.info("Cache flushed") + + +def get_hash(filepath): + with open(filepath, 'rb') as f: + file_hash = hashlib.blake2b() + while chunk := f.read(8192): + file_hash.update(chunk) + + return file_hash.hexdigest()[:18] + + +def check_openai_api_key(): + if not os.environ.get("OPENAI_API_KEY"): + raise ValueError( + "To use GPT for translation, please set up your OpenAI API key " + "as an environment variable in Linux as follows: " + "export OPENAI_API_KEY='your-api-key-here'. Or change the " + "translation process in Advanced settings." + ) + + +class SoniTranslate(SoniTrCache): + def __init__(self, cpu_mode=False): + super().__init__() + if cpu_mode: + os.environ["SONITR_DEVICE"] = "cpu" + else: + os.environ["SONITR_DEVICE"] = ( + "cuda" if torch.cuda.is_available() else "cpu" + ) + + self.device = os.environ.get("SONITR_DEVICE") + self.result_diarize = None + self.align_language = None + self.result_source_lang = None + self.edit_subs_complete = False + self.voiceless_id = None + self.burn_subs_id = None + + self.vci = ClassVoices(only_cpu=cpu_mode) + + self.tts_voices = self.get_tts_voice_list() + + logger.info(f"Working in: {self.device}") + + def get_tts_voice_list(self): + try: + from piper import PiperVoice # noqa + + piper_enabled = True + logger.info("PIPER TTS enabled") + except Exception as error: + logger.debug(str(error)) + piper_enabled = False + logger.info("PIPER TTS disabled") + try: + from TTS.api import TTS # noqa + + xtts_enabled = True + logger.info("Coqui XTTS enabled") + logger.info( + "In this app, by using Coqui TTS (text-to-speech), you " + "acknowledge and agree to the license.\n" + "You confirm that you have read, understood, and agreed " + "to the Terms and Conditions specified at the following " + "link:\nhttps://coqui.ai/cpml.txt." 
+ ) + os.environ["COQUI_TOS_AGREED"] = "1" + except Exception as error: + logger.debug(str(error)) + xtts_enabled = False + logger.info("Coqui XTTS disabled") + + self.tts_info = TTS_Info(piper_enabled, xtts_enabled) + + return self.tts_info.tts_list() + + def batch_multilingual_media_conversion(self, *kwargs): + # logger.debug(str(kwargs)) + + media_file_arg = kwargs[0] if kwargs[0] is not None else [] + + link_media_arg = kwargs[1] + link_media_arg = [x.strip() for x in link_media_arg.split(',')] + link_media_arg = get_link_list(link_media_arg) + + path_arg = kwargs[2] + path_arg = [x.strip() for x in path_arg.split(',')] + path_arg = get_valid_files(path_arg) + + edit_text_arg = kwargs[31] + get_text_arg = kwargs[32] + + is_gui_arg = kwargs[-1] + + kwargs = kwargs[3:] + + media_batch = media_file_arg + link_media_arg + path_arg + media_batch = list(filter(lambda x: x != "", media_batch)) + media_batch = media_batch if media_batch else [None] + logger.debug(str(media_batch)) + + remove_directory_contents("outputs") + + if edit_text_arg or get_text_arg: + return self.multilingual_media_conversion( + media_batch[0], "", "", *kwargs + ) + + if "SET_LIMIT" == os.getenv("DEMO") or "TRUE" == os.getenv("ZERO_GPU"): + media_batch = [media_batch[0]] + + result = [] + for media in media_batch: + # Call the nested function with the parameters + output_file = self.multilingual_media_conversion( + media, "", "", *kwargs + ) + + if isinstance(output_file, str): + output_file = [output_file] + result.extend(output_file) + + if is_gui_arg and len(media_batch) > 1: + gr.Info(f"Done: {os.path.basename(output_file[0])}") + + return result + + def multilingual_media_conversion( + self, + media_file=None, + link_media="", + directory_input="", + YOUR_HF_TOKEN="", + preview=False, + transcriber_model="large-v3", + batch_size=4, + compute_type="auto", + origin_language="Automatic detection", + target_language="English (en)", + min_speakers=1, + max_speakers=1, + tts_voice00="en-US-EmmaMultilingualNeural-Female", + tts_voice01="en-US-AndrewMultilingualNeural-Male", + tts_voice02="en-US-AvaMultilingualNeural-Female", + tts_voice03="en-US-BrianMultilingualNeural-Male", + tts_voice04="de-DE-SeraphinaMultilingualNeural-Female", + tts_voice05="de-DE-FlorianMultilingualNeural-Male", + tts_voice06="fr-FR-VivienneMultilingualNeural-Female", + tts_voice07="fr-FR-RemyMultilingualNeural-Male", + tts_voice08="en-US-EmmaMultilingualNeural-Female", + tts_voice09="en-US-AndrewMultilingualNeural-Male", + tts_voice10="en-US-EmmaMultilingualNeural-Female", + tts_voice11="en-US-AndrewMultilingualNeural-Male", + video_output_name="", + mix_method_audio="Adjusting volumes and mixing audio", + max_accelerate_audio=2.1, + acceleration_rate_regulation=False, + volume_original_audio=0.25, + volume_translated_audio=1.80, + output_format_subtitle="srt", + get_translated_text=False, + get_video_from_text_json=False, + text_json="{}", + avoid_overlap=False, + vocal_refinement=False, + literalize_numbers=True, + segment_duration_limit=15, + diarization_model="pyannote_2.1", + translate_process="google_translator_batch", + subtitle_file=None, + output_type="video (mp4)", + voiceless_track=False, + voice_imitation=False, + voice_imitation_max_segments=3, + voice_imitation_vocals_dereverb=False, + voice_imitation_remove_previous=True, + voice_imitation_method="freevc", + dereverb_automatic_xtts=True, + text_segmentation_scale="sentence", + divide_text_segments_by="", + soft_subtitles_to_video=True, + burn_subtitles_to_video=False, + 
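+ # Remaining options: caching, custom R.V.C. voices, and GUI progress reporting.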
enable_cache=True, + custom_voices=False, + custom_voices_workers=1, + is_gui=False, + progress=gr.Progress(), + ): + if not YOUR_HF_TOKEN: + YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN") + if diarization_model == "disable" or max_speakers == 1: + if YOUR_HF_TOKEN is None: + YOUR_HF_TOKEN = "" + elif not YOUR_HF_TOKEN: + raise ValueError("No valid Hugging Face token") + else: + os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN + + if ( + "gpt" in translate_process + or transcriber_model == "OpenAI_API_Whisper" + or "OpenAI-TTS" in tts_voice00 + ): + check_openai_api_key() + + if media_file is None: + media_file = ( + directory_input + if os.path.exists(directory_input) + else link_media + ) + media_file = ( + media_file if isinstance(media_file, str) else media_file.name + ) + + if is_subtitle_file(media_file): + subtitle_file = media_file + media_file = "" + + if media_file is None: + media_file = "" + + if not origin_language: + origin_language = "Automatic detection" + + if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file: + raise ValueError( + f"The language '{origin_language}' " + "is not supported for transcription (ASR)." + ) + + if get_translated_text: + self.edit_subs_complete = False + if get_video_from_text_json: + if not self.edit_subs_complete: + raise ValueError("Generate the transcription first.") + + if ( + ("sound" in output_type or output_type == "raw media") + and (get_translated_text or get_video_from_text_json) + ): + raise ValueError( + "Please disable 'edit generate subtitles' " + f"first to acquire the {output_type}." + ) + + TRANSLATE_AUDIO_TO = LANGUAGES[target_language] + SOURCE_LANGUAGE = LANGUAGES[origin_language] + + if ( + transcriber_model == "OpenAI_API_Whisper" + and SOURCE_LANGUAGE == "zh-TW" + ): + logger.warning( + "OpenAI API Whisper only supports Chinese (Simplified)." + ) + SOURCE_LANGUAGE = "zh" + + if ( + text_segmentation_scale in ["word", "character"] + and "subtitle" not in output_type + ): + wrn_lang = ( + "Text segmentation by words or characters is typically" + " used for generating subtitles. If subtitles are not the" + " intended output, consider selecting 'sentence' " + "segmentation method to ensure optimal results." + + ) + warn_disp(wrn_lang, is_gui) + + if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower(): + wrn_lang = ( + "Make sure to select a 'TTS Speaker' suitable for" + " the translation language to avoid errors with the TTS." + ) + warn_disp(wrn_lang, is_gui) + + if "_XTTS_" in tts_voice00 and voice_imitation: + wrn_lang = ( + "When you select XTTS, it is advisable " + "to disable Voice Imitation." + ) + warn_disp(wrn_lang, is_gui) + + if custom_voices and voice_imitation: + wrn_lang = ( + "When you use R.V.C. models, it is advisable" + " to disable Voice Imitation." + ) + warn_disp(wrn_lang, is_gui) + + if not media_file and not subtitle_file: + raise ValueError( + "Specify a media or SRT file in advanced settings" + ) + + if subtitle_file: + subtitle_file = ( + subtitle_file + if isinstance(subtitle_file, str) + else subtitle_file.name + ) + + if subtitle_file and SOURCE_LANGUAGE == "Automatic detection": + raise Exception( + "To use an SRT file, you need to specify its " + "original language (Source language)" + ) + + if not media_file and subtitle_file: + diarization_model = "disable" + media_file = "audio_support.wav" + if not get_video_from_text_json: + remove_files(media_file) + srt_data = srt_file_to_segments(subtitle_file) + total_duration = srt_data["segments"][-1]["end"] + 30.
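+ # Build a silent audio track covering the SRT's full duration (plus 30 s of padding) to stand in for the missing media file.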
+ support_audio = AudioSegment.silent( + duration=int(total_duration * 1000) + ) + support_audio.export( + media_file, format="wav" + ) + logger.info("Supporting audio for the SRT file, created.") + + if "SET_LIMIT" == os.getenv("DEMO"): + preview = True + mix_method_audio = "Adjusting volumes and mixing audio" + transcriber_model = "medium" + logger.info( + "DEMO; set preview=True; Generation is limited to " + "10 seconds to prevent CPU errors. No limitations with GPU.\n" + "DEMO; set Adjusting volumes and mixing audio\n" + "DEMO; set whisper model to medium" + ) + + # Check GPU + if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU: + logger.info("Compute type changed to float32") + compute_type = "float32" + + base_video_file = "Video.mp4" + base_audio_wav = "audio.wav" + dub_audio_file = "audio_dub_solo.ogg" + vocals_audio_file = "audio_Vocals_DeReverb.wav" + voiceless_audio_file = "audio_Voiceless.wav" + mix_audio_file = "audio_mix.mp3" + vid_subs = "video_subs_file.mp4" + video_output_file = "video_dub.mp4" + + if os.path.exists(media_file): + media_base_hash = get_hash(media_file) + else: + media_base_hash = media_file + self.clear_cache(media_base_hash, force=(not enable_cache)) + + if not get_video_from_text_json: + self.result_diarize = ( + self.align_language + ) = self.result_source_lang = None + if not self.task_in_cache("media", [media_base_hash, preview], {}): + if is_audio_file(media_file): + prog_disp( + "Processing audio...", 0.15, is_gui, progress=progress + ) + audio_preprocessor(preview, media_file, base_audio_wav) + else: + prog_disp( + "Processing video...", 0.15, is_gui, progress=progress + ) + audio_video_preprocessor( + preview, media_file, base_video_file, base_audio_wav + ) + logger.debug("Set file complete.") + + if "sound" in output_type: + prog_disp( + "Separating sounds in the file...", + 0.50, + is_gui, + progress=progress + ) + separate_out = sound_separate(base_audio_wav, output_type) + final_outputs = [] + for out in separate_out: + final_name = media_out( + media_file, + f"{get_no_ext_filename(out)}", + video_output_name, + "wav", + file_obj=out, + ) + final_outputs.append(final_name) + logger.info(f"Done: {str(final_outputs)}") + return final_outputs + + if output_type == "raw media": + output = media_out( + media_file, + "raw_media", + video_output_name, + "wav" if is_audio_file(media_file) else "mp4", + file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, + ) + logger.info(f"Done: {output}") + return output + + if not self.task_in_cache("refine_vocals", [vocal_refinement], {}): + self.vocals = None + if vocal_refinement: + try: + from soni_translate.mdx_net import process_uvr_task + _, _, _, _, file_vocals = process_uvr_task( + orig_song_path=base_audio_wav, + main_vocals=False, + dereverb=True, + remove_files_output_dir=True, + ) + remove_files(vocals_audio_file) + copy_files(file_vocals, ".") + self.vocals = vocals_audio_file + except Exception as error: + logger.error(str(error)) + + if not self.task_in_cache("transcript_align", [ + subtitle_file, + SOURCE_LANGUAGE, + transcriber_model, + compute_type, + batch_size, + literalize_numbers, + segment_duration_limit, + ( + "l_unit" + if text_segmentation_scale in ["word", "character"] + and subtitle_file + else "sentence" + ) + ], {"vocals": self.vocals}): + if subtitle_file: + prog_disp( + "From SRT file...", 0.30, is_gui, progress=progress + ) + audio = whisperx.load_audio( + base_audio_wav if not self.vocals else self.vocals + ) + self.result = 
srt_file_to_segments(subtitle_file) + self.result["language"] = SOURCE_LANGUAGE + else: + prog_disp( + "Transcribing...", 0.30, is_gui, progress=progress + ) + SOURCE_LANGUAGE = ( + None + if SOURCE_LANGUAGE == "Automatic detection" + else SOURCE_LANGUAGE + ) + audio, self.result = transcribe_speech( + base_audio_wav if not self.vocals else self.vocals, + transcriber_model, + compute_type, + batch_size, + SOURCE_LANGUAGE, + literalize_numbers, + segment_duration_limit, + ) + logger.debug( + "Transcript complete, " + f"segments count {len(self.result['segments'])}" + ) + + self.align_language = self.result["language"] + if ( + not subtitle_file + or text_segmentation_scale in ["word", "character"] + ): + prog_disp("Aligning...", 0.45, is_gui, progress=progress) + try: + if self.align_language in ["vi"]: + logger.info( + "Deficient alignment for the " + f"{self.align_language} language, skipping the" + " process. It is suggested to reduce the " + "duration of the segments as an alternative." + ) + else: + self.result = align_speech(audio, self.result) + logger.debug( + "Align complete, " + f"segments count {len(self.result['segments'])}" + ) + except Exception as error: + logger.error(str(error)) + + if self.result["segments"] == []: + raise ValueError("No active speech found in audio") + + if not self.task_in_cache("break_align", [ + divide_text_segments_by, + text_segmentation_scale, + self.align_language + ], { + "result": self.result, + "align_language": self.align_language + }): + if self.align_language in ["ja", "zh", "zh-TW"]: + divide_text_segments_by += "|!|?|...|。" + if text_segmentation_scale in ["word", "character"]: + self.result = linguistic_level_segments( + self.result, + text_segmentation_scale, + ) + elif divide_text_segments_by: + try: + self.result = break_aling_segments( + self.result, + break_characters=divide_text_segments_by, + ) + except Exception as error: + logger.error(str(error)) + + if not self.task_in_cache("diarize", [ + min_speakers, + max_speakers, + YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2], + diarization_model + ], { + "result": self.result + }): + prog_disp("Diarizing...", 0.60, is_gui, progress=progress) + diarize_model_select = diarization_models[diarization_model] + self.result_diarize = diarize_speech( + base_audio_wav if not self.vocals else self.vocals, + self.result, + min_speakers, + max_speakers, + YOUR_HF_TOKEN, + diarize_model_select, + ) + logger.debug("Diarize complete") + self.result_source_lang = copy.deepcopy(self.result_diarize) + + if not self.task_in_cache("translate", [ + TRANSLATE_AUDIO_TO, + translate_process + ], { + "result_diarize": self.result_diarize + }): + prog_disp("Translating...", 0.70, is_gui, progress=progress) + lang_source = ( + self.align_language + if self.align_language + else SOURCE_LANGUAGE + ) + self.result_diarize["segments"] = translate_text( + self.result_diarize["segments"], + TRANSLATE_AUDIO_TO, + translate_process, + chunk_size=1800, + source=lang_source, + ) + logger.debug("Translation complete") + logger.debug(self.result_diarize) + + if get_translated_text: + + json_data = [] + for segment in self.result_diarize["segments"]: + start = segment["start"] + text = segment["text"] + speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1 + json_data.append( + {"start": start, "text": text, "speaker": speaker} + ) + + # Convert list of dictionaries to a JSON string with indentation + json_string = json.dumps(json_data, indent=2) + logger.info("Done") + self.edit_subs_complete = True + return 
json_string.encode().decode("unicode_escape") + + if get_video_from_text_json: + + if self.result_diarize is None: + raise ValueError("Generate the transcription first.") + # with open('text_json.json', 'r') as file: + text_json_loaded = json.loads(text_json) + for i, segment in enumerate(self.result_diarize["segments"]): + segment["text"] = text_json_loaded[i]["text"] + segment["speaker"] = "SPEAKER_{:02d}".format( + int(text_json_loaded[i]["speaker"]) - 1 + ) + + # Write subtitle + if not self.task_in_cache("subs_and_edit", [ + copy.deepcopy(self.result_diarize), + output_format_subtitle, + TRANSLATE_AUDIO_TO + ], { + "result_diarize": self.result_diarize + }): + if output_format_subtitle == "disable": + self.sub_file = "sub_tra.srt" + elif output_format_subtitle != "ass": + self.sub_file = process_subtitles( + self.result_source_lang, + self.align_language, + self.result_diarize, + output_format_subtitle, + TRANSLATE_AUDIO_TO, + ) + + # Need task + if output_format_subtitle != "srt": + _ = process_subtitles( + self.result_source_lang, + self.align_language, + self.result_diarize, + "srt", + TRANSLATE_AUDIO_TO, + ) + + if output_format_subtitle == "ass": + convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y" + convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y" + self.sub_file = "sub_tra.ass" + run_command(convert_ori) + run_command(convert_tra) + + format_sub = ( + output_format_subtitle + if output_format_subtitle != "disable" + else "srt" + ) + + if output_type == "subtitle": + + out_subs = [] + tra_subs = media_out( + media_file, + TRANSLATE_AUDIO_TO, + video_output_name, + format_sub, + file_obj=self.sub_file, + ) + out_subs.append(tra_subs) + + ori_subs = media_out( + media_file, + self.align_language, + video_output_name, + format_sub, + file_obj=f"sub_ori.{format_sub}", + ) + out_subs.append(ori_subs) + logger.info(f"Done: {out_subs}") + return out_subs + + if output_type == "subtitle [by speaker]": + output = get_subtitle_speaker( + media_file, + result=self.result_diarize, + language=TRANSLATE_AUDIO_TO, + extension=format_sub, + base_name=video_output_name, + ) + logger.info(f"Done: {str(output)}") + return output + + if "video [subtitled]" in output_type: + output = media_out( + media_file, + TRANSLATE_AUDIO_TO + "_subtitled", + video_output_name, + "wav" if is_audio_file(media_file) else ( + "mkv" if "mkv" in output_type else "mp4" + ), + file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file, + soft_subtitles=False if is_audio_file(media_file) else True, + subtitle_files=output_format_subtitle, + ) + msg_out = output[0] if isinstance(output, list) else output + logger.info(f"Done: {msg_out}") + return output + + if not self.task_in_cache("tts", [ + TRANSLATE_AUDIO_TO, + tts_voice00, + tts_voice01, + tts_voice02, + tts_voice03, + tts_voice04, + tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, + dereverb_automatic_xtts + ], { + "sub_file": self.sub_file + }): + prog_disp("Text to speech...", 0.80, is_gui, progress=progress) + self.valid_speakers = audio_segmentation_to_voice( + self.result_diarize, + TRANSLATE_AUDIO_TO, + is_gui, + tts_voice00, + tts_voice01, + tts_voice02, + tts_voice03, + tts_voice04, + tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, + dereverb_automatic_xtts, + ) + + if not self.task_in_cache("acc_and_vc", [ + max_accelerate_audio, + acceleration_rate_regulation, + voice_imitation, + voice_imitation_max_segments, + 
voice_imitation_remove_previous, + voice_imitation_vocals_dereverb, + voice_imitation_method, + custom_voices, + custom_voices_workers, + copy.deepcopy(self.vci.model_config), + avoid_overlap + ], { + "valid_speakers": self.valid_speakers + }): + audio_files, speakers_list = accelerate_segments( + self.result_diarize, + max_accelerate_audio, + self.valid_speakers, + acceleration_rate_regulation, + ) + + # Voice Imitation (Tone color converter) + if voice_imitation: + prog_disp( + "Voice Imitation...", 0.85, is_gui, progress=progress + ) + from soni_translate.text_to_speech import toneconverter + + try: + toneconverter( + copy.deepcopy(self.result_diarize), + voice_imitation_max_segments, + voice_imitation_remove_previous, + voice_imitation_vocals_dereverb, + voice_imitation_method, + ) + except Exception as error: + logger.error(str(error)) + + # custom voice + if custom_voices: + prog_disp( + "Applying customized voices...", + 0.90, + is_gui, + progress=progress, + ) + + try: + self.vci( + audio_files, + speakers_list, + overwrite=True, + parallel_workers=custom_voices_workers, + ) + self.vci.unload_models() + except Exception as error: + logger.error(str(error)) + + prog_disp( + "Creating final translated video...", + 0.95, + is_gui, + progress=progress, + ) + remove_files(dub_audio_file) + create_translated_audio( + self.result_diarize, + audio_files, + dub_audio_file, + False, + avoid_overlap, + ) + + # Voiceless track, change with file + hash_base_audio_wav = get_hash(base_audio_wav) + if voiceless_track: + if self.voiceless_id != hash_base_audio_wav: + from soni_translate.mdx_net import process_uvr_task + + try: + # voiceless_audio_file_dir = "clean_song_output/voiceless" + remove_files(voiceless_audio_file) + uvr_voiceless_audio_wav, _ = process_uvr_task( + orig_song_path=base_audio_wav, + song_id="voiceless", + only_voiceless=True, + remove_files_output_dir=False, + ) + copy_files(uvr_voiceless_audio_wav, ".") + base_audio_wav = voiceless_audio_file + self.voiceless_id = hash_base_audio_wav + + except Exception as error: + logger.error(str(error)) + else: + base_audio_wav = voiceless_audio_file + + if not self.task_in_cache("mix_aud", [ + mix_method_audio, + volume_original_audio, + volume_translated_audio, + voiceless_track + ], {}): + # TYPE MIX AUDIO + remove_files(mix_audio_file) + command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}' + command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}' + if mix_method_audio == "Adjusting volumes and mixing audio": + # volume mix + run_command(command_volume_mix) + else: + try: + # background mix + run_command(command_background_mix) + except Exception as error_mix: + # volume mix except + logger.error(str(error_mix)) + run_command(command_volume_mix) + + if "audio" in output_type or is_audio_file(media_file): + output = media_out( + media_file, + TRANSLATE_AUDIO_TO, + video_output_name, + "wav" if "wav" in output_type else ( + "ogg" if "ogg" in output_type else "mp3" + ), + file_obj=mix_audio_file, + subtitle_files=output_format_subtitle, + ) + msg_out = output[0] if isinstance(output, list) else output + logger.info(f"Done: {msg_out}") + return output + + hash_base_video_file = 
get_hash(base_video_file) + + if burn_subtitles_to_video: + hashvideo_text = [ + hash_base_video_file, + [seg["text"] for seg in self.result_diarize["segments"]] + ] + if self.burn_subs_id != hashvideo_text: + try: + logger.info("Burn subtitles") + remove_files(vid_subs) + command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}" + run_command(command) + base_video_file = vid_subs + self.burn_subs_id = hashvideo_text + except Exception as error: + logger.error(str(error)) + else: + base_video_file = vid_subs + + if not self.task_in_cache("output", [ + hash_base_video_file, + hash_base_audio_wav, + burn_subtitles_to_video + ], {}): + # Merge new audio + video + remove_files(video_output_file) + run_command( + f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}" + ) + + output = media_out( + media_file, + TRANSLATE_AUDIO_TO, + video_output_name, + "mkv" if "mkv" in output_type else "mp4", + file_obj=video_output_file, + soft_subtitles=soft_subtitles_to_video, + subtitle_files=output_format_subtitle, + ) + msg_out = output[0] if isinstance(output, list) else output + logger.info(f"Done: {msg_out}") + + return output + + def hook_beta_processor( + self, + document, + tgt_lang, + translate_process, + ori_lang, + tts, + name_final_file, + custom_voices, + custom_voices_workers, + output_type, + chunk_size, + width, + height, + start_page, + end_page, + bcolor, + is_gui, + progress + ): + prog_disp("Processing pages...", 0.10, is_gui, progress=progress) + doc_data = doc_to_txtximg_pages(document, width, height, start_page, end_page, bcolor) + result_diarize = page_data_to_segments(doc_data, 1700) + + prog_disp("Translating...", 0.20, is_gui, progress=progress) + result_diarize["segments"] = translate_text( + result_diarize["segments"], + tgt_lang, + translate_process, + chunk_size=0, + source=ori_lang, + ) + chunk_size = ( + chunk_size if chunk_size else determine_chunk_size(tts) + ) + doc_data = update_page_data(result_diarize, doc_data) + + prog_disp("Text to speech...", 0.30, is_gui, progress=progress) + result_diarize = page_data_to_segments(doc_data, chunk_size) + valid_speakers = audio_segmentation_to_voice( + result_diarize, + tgt_lang, + is_gui, + tts, + ) + + # fix format and set folder output + audio_files, speakers_list = accelerate_segments( + result_diarize, + 1.0, + valid_speakers, + ) + + # custom voice + if custom_voices: + prog_disp( + "Applying customized voices...", + 0.60, + is_gui, + progress=progress, + ) + self.vci( + audio_files, + speakers_list, + overwrite=True, + parallel_workers=custom_voices_workers, + ) + self.vci.unload_models() + + # Update time segments and not concat + result_diarize = fix_timestamps_docs(result_diarize, audio_files) + final_wav_file = "audio_book.wav" + remove_files(final_wav_file) + + prog_disp("Creating audio file...", 0.70, is_gui, progress=progress) + create_translated_audio( + result_diarize, audio_files, final_wav_file, False + ) + + prog_disp("Creating video file...", 0.80, is_gui, progress=progress) + video_doc = create_video_from_images( + doc_data, + result_diarize + ) + + # Merge video and audio + prog_disp("Merging...", 0.90, is_gui, progress=progress) + vid_out = merge_video_and_audio(video_doc, final_wav_file) + + # End + output = media_out( + document, + tgt_lang, + name_final_file, + "mkv" if "mkv" in output_type else "mp4", + file_obj=vid_out, + ) + logger.info(f"Done: {output}") + return output + + def 
multilingual_docs_conversion( + self, + string_text="", # string + document=None, # doc path gui + directory_input="", # doc path + origin_language="English (en)", + target_language="English (en)", + tts_voice00="en-US-EmmaMultilingualNeural-Female", + name_final_file="", + translate_process="google_translator", + output_type="audio", + chunk_size=None, + custom_voices=False, + custom_voices_workers=1, + start_page=1, + end_page=99999, + width=1280, + height=720, + bcolor="dynamic", + is_gui=False, + progress=gr.Progress(), + ): + if "gpt" in translate_process: + check_openai_api_key() + + SOURCE_LANGUAGE = LANGUAGES[origin_language] + if translate_process != "disable_translation": + TRANSLATE_AUDIO_TO = LANGUAGES[target_language] + else: + TRANSLATE_AUDIO_TO = SOURCE_LANGUAGE + logger.info("No translation") + if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower(): + logger.debug( + "Make sure to select a 'TTS Speaker' suitable for the " + "translation language to avoid errors with the TTS." + ) + + self.clear_cache(string_text, force=True) + + is_string = False + if document is None: + if os.path.exists(directory_input): + document = directory_input + else: + document = string_text + is_string = True + document = document if isinstance(document, str) else document.name + if not document: + raise Exception("No data found") + + if "videobook" in output_type: + if not document.lower().endswith(".pdf"): + raise ValueError( + "Videobooks are only compatible with PDF files." + ) + + return self.hook_beta_processor( + document, + TRANSLATE_AUDIO_TO, + translate_process, + SOURCE_LANGUAGE, + tts_voice00, + name_final_file, + custom_voices, + custom_voices_workers, + output_type, + chunk_size, + width, + height, + start_page, + end_page, + bcolor, + is_gui, + progress + ) + + # audio_wav = "audio.wav" + final_wav_file = "audio_book.wav" + + prog_disp("Processing text...", 0.15, is_gui, progress=progress) + result_file_path, result_text = document_preprocessor( + document, is_string, start_page, end_page + ) + + if ( + output_type == "book (txt)" + and translate_process == "disable_translation" + ): + return result_file_path + + if "SET_LIMIT" == os.getenv("DEMO"): + result_text = result_text[:50] + logger.info( + "DEMO; Generation is limited to 50 characters to prevent " + "CPU errors. 
No limitations with GPU.\n" + ) + + if translate_process != "disable_translation": + # chunks text for translation + result_diarize = plain_text_to_segments(result_text, 1700) + prog_disp("Translating...", 0.30, is_gui, progress=progress) + # not or iterative with 1700 chars + result_diarize["segments"] = translate_text( + result_diarize["segments"], + TRANSLATE_AUDIO_TO, + translate_process, + chunk_size=0, + source=SOURCE_LANGUAGE, + ) + + txt_file_path, result_text = segments_to_plain_text(result_diarize) + + if output_type == "book (txt)": + return media_out( + result_file_path if is_string else document, + TRANSLATE_AUDIO_TO, + name_final_file, + "txt", + file_obj=txt_file_path, + ) + + # (TTS limits) plain text to result_diarize + chunk_size = ( + chunk_size if chunk_size else determine_chunk_size(tts_voice00) + ) + result_diarize = plain_text_to_segments(result_text, chunk_size) + logger.debug(result_diarize) + + prog_disp("Text to speech...", 0.45, is_gui, progress=progress) + valid_speakers = audio_segmentation_to_voice( + result_diarize, + TRANSLATE_AUDIO_TO, + is_gui, + tts_voice00, + ) + + # fix format and set folder output + audio_files, speakers_list = accelerate_segments( + result_diarize, + 1.0, + valid_speakers, + ) + + # custom voice + if custom_voices: + prog_disp( + "Applying customized voices...", + 0.80, + is_gui, + progress=progress, + ) + self.vci( + audio_files, + speakers_list, + overwrite=True, + parallel_workers=custom_voices_workers, + ) + self.vci.unload_models() + + prog_disp( + "Creating final audio file...", 0.90, is_gui, progress=progress + ) + remove_files(final_wav_file) + create_translated_audio( + result_diarize, audio_files, final_wav_file, True + ) + + output = media_out( + result_file_path if is_string else document, + TRANSLATE_AUDIO_TO, + name_final_file, + "mp3" if "mp3" in output_type else ( + "ogg" if "ogg" in output_type else "wav" + ), + file_obj=final_wav_file, + ) + + logger.info(f"Done: {output}") + + return output + + +title = "
📽️ SoniTranslate 🈷️
" + + +def create_gui(theme, logs_in_gui=False): + with gr.Blocks(theme=theme) as app: + gr.Markdown(title) + gr.Markdown(lg_conf["description"]) + + if os.environ.get("ZERO_GPU") == "TRUE": + gr.Markdown( + """ + +
+ ⚠️ Important ⚠️ + +
+ """ + ) + + with gr.Tab(lg_conf["tab_translate"]): + with gr.Row(): + with gr.Column(): + input_data_type = gr.Dropdown( + ["SUBMIT VIDEO", "URL", "Find Video Path"], + value="SUBMIT VIDEO", + label=lg_conf["video_source"], + ) + + def swap_visibility(data_type): + if data_type == "URL": + return ( + gr.update(visible=False, value=None), + gr.update(visible=True, value=""), + gr.update(visible=False, value=""), + ) + elif data_type == "SUBMIT VIDEO": + return ( + gr.update(visible=True, value=None), + gr.update(visible=False, value=""), + gr.update(visible=False, value=""), + ) + elif data_type == "Find Video Path": + return ( + gr.update(visible=False, value=None), + gr.update(visible=False, value=""), + gr.update(visible=True, value=""), + ) + + video_input = gr.File( + label="VIDEO", + file_count="multiple", + type="filepath", + ) + blink_input = gr.Textbox( + visible=False, + label=lg_conf["link_label"], + info=lg_conf["link_info"], + placeholder=lg_conf["link_ph"], + ) + directory_input = gr.Textbox( + visible=False, + label=lg_conf["dir_label"], + info=lg_conf["dir_info"], + placeholder=lg_conf["dir_ph"], + ) + input_data_type.change( + fn=swap_visibility, + inputs=input_data_type, + outputs=[video_input, blink_input, directory_input], + ) + + gr.HTML() + + SOURCE_LANGUAGE = gr.Dropdown( + LANGUAGES_LIST, + value=LANGUAGES_LIST[0], + label=lg_conf["sl_label"], + info=lg_conf["sl_info"], + ) + TRANSLATE_AUDIO_TO = gr.Dropdown( + LANGUAGES_LIST[1:], + value="English (en)", + label=lg_conf["tat_label"], + info=lg_conf["tat_info"], + ) + + gr.HTML("
") + + gr.Markdown(lg_conf["num_speakers"]) + MAX_TTS = 12 + min_speakers = gr.Slider( + 1, + MAX_TTS, + value=1, + label=lg_conf["min_sk"], + step=1, + visible=False, + ) + max_speakers = gr.Slider( + 1, + MAX_TTS, + value=2, + step=1, + label=lg_conf["max_sk"], + ) + gr.Markdown(lg_conf["tts_select"]) + + def submit(value): + visibility_dict = { + f"tts_voice{i:02d}": gr.update(visible=i < value) + for i in range(MAX_TTS) + } + return [value for value in visibility_dict.values()] + + tts_voice00 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-EmmaMultilingualNeural-Female", + label=lg_conf["sk1"], + visible=True, + interactive=True, + ) + tts_voice01 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-AndrewMultilingualNeural-Male", + label=lg_conf["sk2"], + visible=True, + interactive=True, + ) + tts_voice02 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-AvaMultilingualNeural-Female", + label=lg_conf["sk3"], + visible=False, + interactive=True, + ) + tts_voice03 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-BrianMultilingualNeural-Male", + label=lg_conf["sk4"], + visible=False, + interactive=True, + ) + tts_voice04 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="de-DE-SeraphinaMultilingualNeural-Female", + label=lg_conf["sk4"], + visible=False, + interactive=True, + ) + tts_voice05 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="de-DE-FlorianMultilingualNeural-Male", + label=lg_conf["sk6"], + visible=False, + interactive=True, + ) + tts_voice06 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="fr-FR-VivienneMultilingualNeural-Female", + label=lg_conf["sk7"], + visible=False, + interactive=True, + ) + tts_voice07 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="fr-FR-RemyMultilingualNeural-Male", + label=lg_conf["sk8"], + visible=False, + interactive=True, + ) + tts_voice08 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-EmmaMultilingualNeural-Female", + label=lg_conf["sk9"], + visible=False, + interactive=True, + ) + tts_voice09 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-AndrewMultilingualNeural-Male", + label=lg_conf["sk10"], + visible=False, + interactive=True, + ) + tts_voice10 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-EmmaMultilingualNeural-Female", + label=lg_conf["sk11"], + visible=False, + interactive=True, + ) + tts_voice11 = gr.Dropdown( + SoniTr.tts_info.tts_list(), + value="en-US-AndrewMultilingualNeural-Male", + label=lg_conf["sk12"], + visible=False, + interactive=True, + ) + max_speakers.change( + submit, + max_speakers, + [ + tts_voice00, + tts_voice01, + tts_voice02, + tts_voice03, + tts_voice04, + tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, + ], + ) + + with gr.Column(): + with gr.Accordion( + lg_conf["vc_title"], + open=False, + ): + gr.Markdown(lg_conf["vc_subtitle"]) + voice_imitation_gui = gr.Checkbox( + False, + label=lg_conf["vc_active_label"], + info=lg_conf["vc_active_info"], + ) + openvoice_models = ["openvoice", "openvoice_v2"] + voice_imitation_method_options = ( + ["freevc"] + openvoice_models + if SoniTr.tts_info.xtts_enabled + else openvoice_models + ) + voice_imitation_method_gui = gr.Dropdown( + voice_imitation_method_options, + value=voice_imitation_method_options[0], + label=lg_conf["vc_method_label"], + info=lg_conf["vc_method_info"], + ) + voice_imitation_max_segments_gui = gr.Slider( + label=lg_conf["vc_segments_label"], + info=lg_conf["vc_segments_info"], + value=3, + step=1, + 
minimum=1, + maximum=10, + visible=True, + interactive=True, + ) + voice_imitation_vocals_dereverb_gui = gr.Checkbox( + False, + label=lg_conf["vc_dereverb_label"], + info=lg_conf["vc_dereverb_info"], + ) + voice_imitation_remove_previous_gui = gr.Checkbox( + True, + label=lg_conf["vc_remove_label"], + info=lg_conf["vc_remove_info"], + ) + + if SoniTr.tts_info.xtts_enabled: + with gr.Column(): + with gr.Accordion( + lg_conf["xtts_title"], + open=False, + ): + gr.Markdown(lg_conf["xtts_subtitle"]) + wav_speaker_file = gr.File( + label=lg_conf["xtts_file_label"] + ) + wav_speaker_name = gr.Textbox( + label=lg_conf["xtts_name_label"], + value="", + info=lg_conf["xtts_name_info"], + placeholder="default_name", + lines=1, + ) + wav_speaker_start = gr.Number( + label="Time audio start", + value=0, + visible=False, + ) + wav_speaker_end = gr.Number( + label="Time audio end", + value=0, + visible=False, + ) + wav_speaker_dir = gr.Textbox( + label="Directory save", + value="_XTTS_", + visible=False, + ) + wav_speaker_dereverb = gr.Checkbox( + True, + label=lg_conf["xtts_dereverb_label"], + info=lg_conf["xtts_dereverb_info"] + ) + wav_speaker_output = gr.HTML() + create_xtts_wav = gr.Button( + lg_conf["xtts_button"] + ) + gr.Markdown(lg_conf["xtts_footer"]) + else: + wav_speaker_dereverb = gr.Checkbox( + False, + label=lg_conf["xtts_dereverb_label"], + info=lg_conf["xtts_dereverb_info"], + visible=False + ) + + with gr.Column(): + with gr.Accordion( + lg_conf["extra_setting"], open=False + ): + audio_accelerate = gr.Slider( + label=lg_conf["acc_max_label"], + value=1.9, + step=0.1, + minimum=1.0, + maximum=2.5, + visible=True, + interactive=True, + info=lg_conf["acc_max_info"], + ) + acceleration_rate_regulation_gui = gr.Checkbox( + False, + label=lg_conf["acc_rate_label"], + info=lg_conf["acc_rate_info"], + ) + avoid_overlap_gui = gr.Checkbox( + False, + label=lg_conf["or_label"], + info=lg_conf["or_info"], + ) + + gr.HTML("
") + + audio_mix_options = [ + "Mixing audio with sidechain compression", + "Adjusting volumes and mixing audio", + ] + AUDIO_MIX = gr.Dropdown( + audio_mix_options, + value=audio_mix_options[1], + label=lg_conf["aud_mix_label"], + info=lg_conf["aud_mix_info"], + ) + volume_original_mix = gr.Slider( + label=lg_conf["vol_ori"], + info="for Adjusting volumes and mixing audio", + value=0.25, + step=0.05, + minimum=0.0, + maximum=2.50, + visible=True, + interactive=True, + ) + volume_translated_mix = gr.Slider( + label=lg_conf["vol_tra"], + info="for Adjusting volumes and mixing audio", + value=1.80, + step=0.05, + minimum=0.0, + maximum=2.50, + visible=True, + interactive=True, + ) + main_voiceless_track = gr.Checkbox( + label=lg_conf["voiceless_tk_label"], + info=lg_conf["voiceless_tk_info"], + ) + + gr.HTML("
") + sub_type_options = [ + "disable", + "srt", + "vtt", + "ass", + "txt", + "tsv", + "json", + "aud", + ] + + sub_type_output = gr.Dropdown( + sub_type_options, + value=sub_type_options[1], + label=lg_conf["sub_type"], + ) + soft_subtitles_to_video_gui = gr.Checkbox( + label=lg_conf["soft_subs_label"], + info=lg_conf["soft_subs_info"], + ) + burn_subtitles_to_video_gui = gr.Checkbox( + label=lg_conf["burn_subs_label"], + info=lg_conf["burn_subs_info"], + ) + + gr.HTML("
") + gr.Markdown(lg_conf["whisper_title"]) + literalize_numbers_gui = gr.Checkbox( + True, + label=lg_conf["lnum_label"], + info=lg_conf["lnum_info"], + ) + vocal_refinement_gui = gr.Checkbox( + False, + label=lg_conf["scle_label"], + info=lg_conf["scle_info"], + ) + segment_duration_limit_gui = gr.Slider( + label=lg_conf["sd_limit_label"], + info=lg_conf["sd_limit_info"], + value=15, + step=1, + minimum=1, + maximum=30, + ) + whisper_model_default = ( + "large-v3" + if SoniTr.device == "cuda" + else "medium" + ) + + WHISPER_MODEL_SIZE = gr.Dropdown( + ASR_MODEL_OPTIONS + find_whisper_models(), + value=whisper_model_default, + label="Whisper ASR model", + info=lg_conf["asr_model_info"], + allow_custom_value=True, + ) + com_t_opt, com_t_default = ( + [COMPUTE_TYPE_GPU, "float16"] + if SoniTr.device == "cuda" + else [COMPUTE_TYPE_CPU, "float32"] + ) + compute_type = gr.Dropdown( + com_t_opt, + value=com_t_default, + label=lg_conf["ctype_label"], + info=lg_conf["ctype_info"], + ) + batch_size = gr.Slider( + minimum=1, + maximum=32, + value=8, + label=lg_conf["batchz_label"], + info=lg_conf["batchz_info"], + step=1, + ) + input_srt = gr.File( + label=lg_conf["srt_file_label"], + file_types=[".srt", ".ass", ".vtt"], + height=130, + ) + + gr.HTML("
") + text_segmentation_options = [ + "sentence", + "word", + "character" + ] + text_segmentation_scale_gui = gr.Dropdown( + text_segmentation_options, + value=text_segmentation_options[0], + label=lg_conf["tsscale_label"], + info=lg_conf["tsscale_info"], + ) + divide_text_segments_by_gui = gr.Textbox( + label=lg_conf["divide_text_label"], + value="", + info=lg_conf["divide_text_info"], + ) + + gr.HTML("
") + pyannote_models_list = list( + diarization_models.keys() + ) + diarization_process_dropdown = gr.Dropdown( + pyannote_models_list, + value=pyannote_models_list[1], + label=lg_conf["diarization_label"], + ) + translate_process_dropdown = gr.Dropdown( + TRANSLATION_PROCESS_OPTIONS, + value=TRANSLATION_PROCESS_OPTIONS[0], + label=lg_conf["tr_process_label"], + ) + + gr.HTML("
") + main_output_type = gr.Dropdown( + OUTPUT_TYPE_OPTIONS, + value=OUTPUT_TYPE_OPTIONS[0], + label=lg_conf["out_type_label"], + ) + VIDEO_OUTPUT_NAME = gr.Textbox( + label=lg_conf["out_name_label"], + value="", + info=lg_conf["out_name_info"], + ) + play_sound_gui = gr.Checkbox( + True, + label=lg_conf["task_sound_label"], + info=lg_conf["task_sound_info"], + ) + enable_cache_gui = gr.Checkbox( + True, + label=lg_conf["cache_label"], + info=lg_conf["cache_info"], + ) + PREVIEW = gr.Checkbox( + label="Preview", info=lg_conf["preview_info"] + ) + is_gui_dummy_check = gr.Checkbox( + True, visible=False + ) + + with gr.Column(variant="compact"): + edit_sub_check = gr.Checkbox( + label=lg_conf["edit_sub_label"], + info=lg_conf["edit_sub_info"], + ) + dummy_false_check = gr.Checkbox( + False, + visible=False, + ) + + def visible_component_subs(input_bool): + if input_bool: + return gr.update(visible=True), gr.update( + visible=True + ) + else: + return gr.update(visible=False), gr.update( + visible=False + ) + + subs_button = gr.Button( + lg_conf["button_subs"], + variant="primary", + visible=False, + ) + subs_edit_space = gr.Textbox( + visible=False, + lines=10, + label=lg_conf["editor_sub_label"], + info=lg_conf["editor_sub_info"], + placeholder=lg_conf["editor_sub_ph"], + ) + edit_sub_check.change( + visible_component_subs, + [edit_sub_check], + [subs_button, subs_edit_space], + ) + + with gr.Row(): + video_button = gr.Button( + lg_conf["button_translate"], + variant="primary", + ) + with gr.Row(): + video_output = gr.File( + label=lg_conf["output_result_label"], + file_count="multiple", + interactive=False, + + ) # gr.Video() + + gr.HTML("
") + + if ( + os.getenv("YOUR_HF_TOKEN") is None + or os.getenv("YOUR_HF_TOKEN") == "" + ): + HFKEY = gr.Textbox( + visible=True, + label="HF Token", + info=lg_conf["ht_token_info"], + placeholder=lg_conf["ht_token_ph"], + ) + else: + HFKEY = gr.Textbox( + visible=False, + label="HF Token", + info=lg_conf["ht_token_info"], + placeholder=lg_conf["ht_token_ph"], + ) + + gr.Examples( + examples=[ + [ + ["./assets/Video_main.mp4"], + "", + "", + "", + False, + whisper_model_default, + 4, + com_t_default, + "Spanish (es)", + "English (en)", + 1, + 2, + "en-CA-ClaraNeural-Female", + "en-AU-WilliamNeural-Male", + ], + ], # no update + fn=SoniTr.batch_multilingual_media_conversion, + inputs=[ + video_input, + blink_input, + directory_input, + HFKEY, + PREVIEW, + WHISPER_MODEL_SIZE, + batch_size, + compute_type, + SOURCE_LANGUAGE, + TRANSLATE_AUDIO_TO, + min_speakers, + max_speakers, + tts_voice00, + tts_voice01, + ], + outputs=[video_output], + cache_examples=False, + ) + + with gr.Tab(lg_conf["tab_docs"]): + with gr.Column(): + with gr.Accordion("Docs", open=True): + with gr.Column(variant="compact"): + with gr.Column(): + input_doc_type = gr.Dropdown( + [ + "WRITE TEXT", + "SUBMIT DOCUMENT", + "Find Document Path", + ], + value="SUBMIT DOCUMENT", + label=lg_conf["docs_input_label"], + info=lg_conf["docs_input_info"], + ) + + def swap_visibility(data_type): + if data_type == "WRITE TEXT": + return ( + gr.update(visible=True, value=""), + gr.update(visible=False, value=None), + gr.update(visible=False, value=""), + ) + elif data_type == "SUBMIT DOCUMENT": + return ( + gr.update(visible=False, value=""), + gr.update(visible=True, value=None), + gr.update(visible=False, value=""), + ) + elif data_type == "Find Document Path": + return ( + gr.update(visible=False, value=""), + gr.update(visible=False, value=None), + gr.update(visible=True, value=""), + ) + + text_docs = gr.Textbox( + label="Text", + value="This is an example", + info="Write a text", + placeholder="...", + lines=5, + visible=False, + ) + input_docs = gr.File( + label="Document", visible=True + ) + directory_input_docs = gr.Textbox( + visible=False, + label="Document Path", + info="Example: /home/my_doc.pdf", + placeholder="Path goes here...", + ) + input_doc_type.change( + fn=swap_visibility, + inputs=input_doc_type, + outputs=[ + text_docs, + input_docs, + directory_input_docs, + ], + ) + + gr.HTML() + + tts_documents = gr.Dropdown( + list( + filter( + lambda x: x != "_XTTS_/AUTOMATIC.wav", + SoniTr.tts_info.tts_list(), + ) + ), + value="en-US-EmmaMultilingualNeural-Female", + label="TTS", + visible=True, + interactive=True, + ) + + gr.HTML() + + docs_SOURCE_LANGUAGE = gr.Dropdown( + LANGUAGES_LIST[1:], + value="English (en)", + label=lg_conf["sl_label"], + info=lg_conf["docs_source_info"], + ) + docs_TRANSLATE_TO = gr.Dropdown( + LANGUAGES_LIST[1:], + value="English (en)", + label=lg_conf["tat_label"], + info=lg_conf["tat_info"], + ) + + with gr.Column(): + with gr.Accordion( + lg_conf["extra_setting"], open=False + ): + docs_translate_process_dropdown = gr.Dropdown( + DOCS_TRANSLATION_PROCESS_OPTIONS, + value=DOCS_TRANSLATION_PROCESS_OPTIONS[ + 0 + ], + label="Translation process", + ) + + gr.HTML("
") + + docs_output_type = gr.Dropdown( + DOCS_OUTPUT_TYPE_OPTIONS, + value=DOCS_OUTPUT_TYPE_OPTIONS[2], + label="Output type", + ) + docs_OUTPUT_NAME = gr.Textbox( + label="Final file name", + value="", + info=lg_conf["out_name_info"], + ) + docs_chunk_size = gr.Number( + label=lg_conf["chunk_size_label"], + value=0, + visible=True, + interactive=True, + info=lg_conf["chunk_size_info"], + ) + gr.HTML("
") + start_page_gui = gr.Number( + step=1, + value=1, + minimum=1, + maximum=99999, + label="Start page", + ) + end_page_gui = gr.Number( + step=1, + value=99999, + minimum=1, + maximum=99999, + label="End page", + ) + gr.HTML("
Videobook config") + videobook_width_gui = gr.Number( + step=1, + value=1280, + minimum=100, + maximum=4096, + label="Width", + ) + videobook_height_gui = gr.Number( + step=1, + value=720, + minimum=100, + maximum=4096, + label="Height", + ) + videobook_bcolor_gui = gr.Dropdown( + BORDER_COLORS, + value=BORDER_COLORS[0], + label="Border color", + ) + docs_dummy_check = gr.Checkbox( + True, visible=False + ) + + with gr.Row(): + docs_button = gr.Button( + lg_conf["docs_button"], + variant="primary", + ) + with gr.Row(): + docs_output = gr.File( + label="Result", + interactive=False, + ) + + with gr.Tab("Custom voice R.V.C. (Optional)"): + + with gr.Column(): + with gr.Accordion("Get the R.V.C. Models", open=True): + url_links = gr.Textbox( + label="URLs", + value="", + info=lg_conf["cv_url_info"], + placeholder="urls here...", + lines=1, + ) + download_finish = gr.HTML() + download_button = gr.Button("DOWNLOAD MODELS") + + def update_models(): + models_path, index_path = upload_model_list() + + dict_models = { + f"fmodel{i:02d}": gr.update( + choices=models_path + ) + for i in range(MAX_TTS+1) + } + dict_index = { + f"findex{i:02d}": gr.update( + choices=index_path, value=None + ) + for i in range(MAX_TTS+1) + } + dict_changes = {**dict_models, **dict_index} + return [value for value in dict_changes.values()] + + with gr.Column(): + with gr.Accordion(lg_conf["replace_title"], open=False): + with gr.Column(variant="compact"): + with gr.Column(): + gr.Markdown(lg_conf["sec1_title"]) + enable_custom_voice = gr.Checkbox( + False, + label="ENABLE", + info=lg_conf["enable_replace"] + ) + workers_custom_voice = gr.Number( + step=1, + value=1, + minimum=1, + maximum=50, + label="workers", + visible=False, + ) + + gr.Markdown(lg_conf["sec2_title"]) + gr.Markdown(lg_conf["sec2_subtitle"]) + + PITCH_ALGO_OPT = [ + "pm", + "harvest", + "crepe", + "rmvpe", + "rmvpe+", + ] + + def model_conf(): + return gr.Dropdown( + models_path, + # value="", + label="Model", + visible=True, + interactive=True, + ) + + def pitch_algo_conf(): + return gr.Dropdown( + PITCH_ALGO_OPT, + value=PITCH_ALGO_OPT[3], + label="Pitch algorithm", + visible=True, + interactive=True, + ) + + def pitch_lvl_conf(): + return gr.Slider( + label="Pitch level", + minimum=-24, + maximum=24, + step=1, + value=0, + visible=True, + interactive=True, + ) + + def index_conf(): + return gr.Dropdown( + index_path, + value=None, + label="Index", + visible=True, + interactive=True, + ) + + def index_inf_conf(): + return gr.Slider( + minimum=0, + maximum=1, + label="Index influence", + value=0.75, + ) + + def respiration_filter_conf(): + return gr.Slider( + minimum=0, + maximum=7, + label="Respiration median filtering", + value=3, + step=1, + interactive=True, + ) + + def envelope_ratio_conf(): + return gr.Slider( + minimum=0, + maximum=1, + label="Envelope ratio", + value=0.25, + interactive=True, + ) + + def consonant_protec_conf(): + return gr.Slider( + minimum=0, + maximum=0.5, + label="Consonant breath protection", + value=0.5, + interactive=True, + ) + + def button_conf(tts_name): + return gr.Button( + lg_conf["cv_button_apply"]+" "+tts_name, + variant="primary", + ) + + TTS_TABS = [ + 'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1) + ] + + CV_SUBTITLES = [ + lg_conf["cv_tts1"], + lg_conf["cv_tts2"], + lg_conf["cv_tts3"], + lg_conf["cv_tts4"], + lg_conf["cv_tts5"], + lg_conf["cv_tts6"], + lg_conf["cv_tts7"], + lg_conf["cv_tts8"], + lg_conf["cv_tts9"], + lg_conf["cv_tts10"], + lg_conf["cv_tts11"], + lg_conf["cv_tts12"], + ] + + 
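# The loop below builds one R.V.C. configuration panel per TTS speaker and + # stores each panel's model/index dropdown components in configs_storage, + # so that update_models() can refresh their choices after new models are + # downloaded with the DOWNLOAD MODELS button. +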
configs_storage = [] + + for i in range(MAX_TTS): # Loop from 00 to 11 + with gr.Accordion(CV_SUBTITLES[i], open=False): + gr.Markdown(TTS_TABS[i]) + with gr.Column(): + tag_gui = gr.Textbox( + value=TTS_TABS[i], visible=False + ) + model_gui = model_conf() + pitch_algo_gui = pitch_algo_conf() + pitch_lvl_gui = pitch_lvl_conf() + index_gui = index_conf() + index_inf_gui = index_inf_conf() + rmf_gui = respiration_filter_conf() + er_gui = envelope_ratio_conf() + cbp_gui = consonant_protec_conf() + + with gr.Row(variant="compact"): + button_config = button_conf( + TTS_TABS[i] + ) + + confirm_conf = gr.HTML() + + button_config.click( + SoniTr.vci.apply_conf, + inputs=[ + tag_gui, + model_gui, + pitch_algo_gui, + pitch_lvl_gui, + index_gui, + index_inf_gui, + rmf_gui, + er_gui, + cbp_gui, + ], + outputs=[confirm_conf], + ) + + configs_storage.append({ + "tag": tag_gui, + "model": model_gui, + "index": index_gui, + }) + + with gr.Column(): + with gr.Accordion("Test R.V.C.", open=False): + with gr.Row(variant="compact"): + text_test = gr.Textbox( + label="Text", + value="This is an example", + info="write a text", + placeholder="...", + lines=5, + ) + with gr.Column(): + tts_test = gr.Dropdown( + sorted(SoniTr.tts_info.list_edge), + value="en-GB-ThomasNeural-Male", + label="TTS", + visible=True, + interactive=True, + ) + model_test = model_conf() + index_test = index_conf() + pitch_test = pitch_lvl_conf() + pitch_alg_test = pitch_algo_conf() + with gr.Row(variant="compact"): + button_test = gr.Button("Test audio") + + with gr.Column(): + with gr.Row(): + original_ttsvoice = gr.Audio() + ttsvoice = gr.Audio() + + button_test.click( + SoniTr.vci.make_test, + inputs=[ + text_test, + tts_test, + model_test, + index_test, + pitch_test, + pitch_alg_test, + ], + outputs=[ttsvoice, original_ttsvoice], + ) + + download_button.click( + download_list, + [url_links], + [download_finish], + queue=False + ).then( + update_models, + [], + [ + elem["model"] for elem in configs_storage + ] + [model_test] + [ + elem["index"] for elem in configs_storage + ] + [index_test], + ) + + with gr.Tab(lg_conf["tab_help"]): + gr.Markdown(lg_conf["tutorial"]) + gr.Markdown(news) + + def play_sound_alert(play_sound): + + if not play_sound: + return None + + # silent_sound = "assets/empty_audio.mp3" + sound_alert = "assets/sound_alert.mp3" + + time.sleep(0.25) + # yield silent_sound + yield None + + time.sleep(0.25) + yield sound_alert + + sound_alert_notification = gr.Audio( + value=None, + type="filepath", + format="mp3", + autoplay=True, + visible=False, + ) + + if logs_in_gui: + logger.info("Logs in gui need public url") + + class Logger: + def __init__(self, filename): + self.terminal = sys.stdout + self.log = open(filename, "w") + + def write(self, message): + self.terminal.write(message) + self.log.write(message) + + def flush(self): + self.terminal.flush() + self.log.flush() + + def isatty(self): + return False + + sys.stdout = Logger("output.log") + + def read_logs(): + sys.stdout.flush() + with open("output.log", "r") as f: + return f.read() + + with gr.Accordion("Logs", open=False): + logs = gr.Textbox(label=">>>") + app.load(read_logs, None, logs, every=1) + + if SoniTr.tts_info.xtts_enabled: + # Update tts list + def update_tts_list(): + update_dict = { + f"tts_voice{i:02d}": gr.update(choices=SoniTr.tts_info.tts_list()) + for i in range(MAX_TTS) + } + update_dict["tts_documents"] = gr.update( + choices=list( + filter( + lambda x: x != "_XTTS_/AUTOMATIC.wav", + SoniTr.tts_info.tts_list(), + ) + ) + ) + return 
[value for value in update_dict.values()] + + create_xtts_wav.click( + create_wav_file_vc, + inputs=[ + wav_speaker_name, + wav_speaker_file, + wav_speaker_start, + wav_speaker_end, + wav_speaker_dir, + wav_speaker_dereverb, + ], + outputs=[wav_speaker_output], + ).then( + update_tts_list, + None, + [ + tts_voice00, + tts_voice01, + tts_voice02, + tts_voice03, + tts_voice04, + tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, + tts_documents, + ], + ) + + # Run translate text + subs_button.click( + SoniTr.batch_multilingual_media_conversion, + inputs=[ + video_input, + blink_input, + directory_input, + HFKEY, + PREVIEW, + WHISPER_MODEL_SIZE, + batch_size, + compute_type, + SOURCE_LANGUAGE, + TRANSLATE_AUDIO_TO, + min_speakers, + max_speakers, + tts_voice00, + tts_voice01, + tts_voice02, + tts_voice03, + tts_voice04, + tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, + VIDEO_OUTPUT_NAME, + AUDIO_MIX, + audio_accelerate, + acceleration_rate_regulation_gui, + volume_original_mix, + volume_translated_mix, + sub_type_output, + edit_sub_check, # TRUE BY DEFAULT + dummy_false_check, # dummy false + subs_edit_space, + avoid_overlap_gui, + vocal_refinement_gui, + literalize_numbers_gui, + segment_duration_limit_gui, + diarization_process_dropdown, + translate_process_dropdown, + input_srt, + main_output_type, + main_voiceless_track, + voice_imitation_gui, + voice_imitation_max_segments_gui, + voice_imitation_vocals_dereverb_gui, + voice_imitation_remove_previous_gui, + voice_imitation_method_gui, + wav_speaker_dereverb, + text_segmentation_scale_gui, + divide_text_segments_by_gui, + soft_subtitles_to_video_gui, + burn_subtitles_to_video_gui, + enable_cache_gui, + enable_custom_voice, + workers_custom_voice, + is_gui_dummy_check, + ], + outputs=subs_edit_space, + ).then( + play_sound_alert, [play_sound_gui], [sound_alert_notification] + ) + + # Run translate tts and complete + video_button.click( + SoniTr.batch_multilingual_media_conversion, + inputs=[ + video_input, + blink_input, + directory_input, + HFKEY, + PREVIEW, + WHISPER_MODEL_SIZE, + batch_size, + compute_type, + SOURCE_LANGUAGE, + TRANSLATE_AUDIO_TO, + min_speakers, + max_speakers, + tts_voice00, + tts_voice01, + tts_voice02, + tts_voice03, + tts_voice04, + tts_voice05, + tts_voice06, + tts_voice07, + tts_voice08, + tts_voice09, + tts_voice10, + tts_voice11, + VIDEO_OUTPUT_NAME, + AUDIO_MIX, + audio_accelerate, + acceleration_rate_regulation_gui, + volume_original_mix, + volume_translated_mix, + sub_type_output, + dummy_false_check, + edit_sub_check, + subs_edit_space, + avoid_overlap_gui, + vocal_refinement_gui, + literalize_numbers_gui, + segment_duration_limit_gui, + diarization_process_dropdown, + translate_process_dropdown, + input_srt, + main_output_type, + main_voiceless_track, + voice_imitation_gui, + voice_imitation_max_segments_gui, + voice_imitation_vocals_dereverb_gui, + voice_imitation_remove_previous_gui, + voice_imitation_method_gui, + wav_speaker_dereverb, + text_segmentation_scale_gui, + divide_text_segments_by_gui, + soft_subtitles_to_video_gui, + burn_subtitles_to_video_gui, + enable_cache_gui, + enable_custom_voice, + workers_custom_voice, + is_gui_dummy_check, + ], + outputs=video_output, + trigger_mode="multiple", + ).then( + play_sound_alert, [play_sound_gui], [sound_alert_notification] + ) + + # Run docs process + docs_button.click( + SoniTr.multilingual_docs_conversion, + inputs=[ + text_docs, + input_docs, + 
directory_input_docs, + docs_SOURCE_LANGUAGE, + docs_TRANSLATE_TO, + tts_documents, + docs_OUTPUT_NAME, + docs_translate_process_dropdown, + docs_output_type, + docs_chunk_size, + enable_custom_voice, + workers_custom_voice, + start_page_gui, + end_page_gui, + videobook_width_gui, + videobook_height_gui, + videobook_bcolor_gui, + docs_dummy_check, + ], + outputs=docs_output, + trigger_mode="multiple", + ).then( + play_sound_alert, [play_sound_gui], [sound_alert_notification] + ) + + return app + + +def get_language_config(language_data, language=None, base_key="english"): + base_lang = language_data.get(base_key) + + if language not in language_data: + logger.error( + f"Language {language} not found, defaulting to {base_key}" + ) + return base_lang + + lg_conf = language_data.get(language, {}) + lg_conf.update((k, v) for k, v in base_lang.items() if k not in lg_conf) + + return lg_conf + + +def create_parser(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--theme", + type=str, + default="Taithrah/Minimal", + help=( + "Specify the theme; find themes in " + "https://huggingface.co/spaces/gradio/theme-gallery;" + " Example: --theme aliabid94/new-theme" + ), + ) + parser.add_argument( + "--public_url", + action="store_true", + default=False, + help="Enable public link", + ) + parser.add_argument( + "--logs_in_gui", + action="store_true", + default=False, + help="Displays the operations performed in Logs", + ) + parser.add_argument( + "--verbosity_level", + type=str, + default="info", + help=( + "Set logger verbosity level: " + "debug, info, warning, error, or critical" + ), + ) + parser.add_argument( + "--language", + type=str, + default="english", + help=" Select the language of the interface: english, spanish", + ) + parser.add_argument( + "--cpu_mode", + action="store_true", + default=False, + help="Enable CPU mode to run the program without utilizing GPU acceleration.", + ) + return parser + + +if __name__ == "__main__": + + parser = create_parser() + + args = parser.parse_args() + # Simulating command-line arguments + # args_list = "--theme aliabid94/new-theme --public_url".split() + # args = parser.parse_args(args_list) + + set_logging_level(args.verbosity_level) + + for id_model in UVR_MODELS: + download_manager( + os.path.join(MDX_DOWNLOAD_LINK, id_model), mdxnet_models_dir + ) + + models_path, index_path = upload_model_list() + + SoniTr = SoniTranslate(cpu_mode=args.cpu_mode if os.environ.get("ZERO_GPU") != "TRUE" else "cpu") + + lg_conf = get_language_config(language_data, language=args.language) + + app = create_gui(args.theme, logs_in_gui=args.logs_in_gui) + + app.queue() + + app.launch( + max_threads=1, + share=args.public_url, + show_error=True, + quiet=False, + debug=(True if logger.isEnabledFor(logging.DEBUG) else False), + ) diff --git a/assets/logo.jpeg b/assets/logo.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..7c4e0375add91a53f6e509833b19e5ce3c322fd9 Binary files /dev/null and b/assets/logo.jpeg differ diff --git a/docs/windows_install.md b/docs/windows_install.md new file mode 100644 index 0000000000000000000000000000000000000000..7bad658a0aa9b5cca1a7ff157b5bba2fe297153d --- /dev/null +++ b/docs/windows_install.md @@ -0,0 +1,150 @@ +## Install Locally Windows + +### Before You Start + +Before you start installing and using SoniTranslate, there are a few things you need to do: + +1. 
Install the Microsoft Visual C++ Build Tools, MSVC, and the Windows 10 SDK: + + * Go to the [Visual Studio downloads page](https://visualstudio.microsoft.com/visual-cpp-build-tools/). If you already have the **Visual Studio Installer**, open it and click **Modify** instead. + * Download and install the "Build Tools for Visual Studio" if you don't have it. + * During installation, under "Workloads", select "C++ build tools" and make sure the latest versions of "MSVC v142 - VS 2019 C++ x64/x86 build tools" and "Windows 10 SDK" are selected ("Windows 11 SDK" if you are using Windows 11); alternatively, select those two items under "Individual components". + * Complete the installation. + +2. Verify the NVIDIA driver on Windows using the command line: + + * **Open Command Prompt:** Press `Win + R`, type `cmd`, then press `Enter`. + + * **Type the command** `nvidia-smi` and press `Enter`. + + * **Look for "CUDA Version"** in the output. + +``` ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 522.25 Driver Version: 522.25 CUDA Version: 11.8 | +|-------------------------------+----------------------+----------------------+ +``` + +3. If your CUDA version is lower than 11.8, update your NVIDIA driver. Visit the NVIDIA driver download page (https://www.nvidia.com/Download/index.aspx) and enter your graphics card information. + +4. Accept the license agreement for using Pyannote. You need a Hugging Face account and must accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation +5. Create a [Hugging Face token](https://huggingface.co/settings/tokens). Hugging Face is a natural language processing platform that provides access to state-of-the-art models and tools. You will need a token to use some of the automatic model download features in SoniTranslate. Follow the instructions on the Hugging Face website to create one. +6. Install [Anaconda](https://www.anaconda.com/) or [Miniconda](https://docs.anaconda.com/free/miniconda/miniconda-install/). Anaconda is a free and open-source distribution of Python and R. It includes a package manager called conda that makes it easy to install and manage Python environments and packages. Follow the instructions on the Anaconda website to download and install it on your system. +7. Install Git for your system. Git is a version control system that helps you track changes to your code and collaborate with other developers. You can install Git with Anaconda by running `conda install -c anaconda git -y` in your terminal (do this after step 1 of the following section). If you have trouble installing Git via Anaconda, use the following link instead: + - [Git for Windows](https://git-scm.com/download/win) + +Once you have completed these steps, you will be ready to install SoniTranslate. + +### Getting Started + +To install SoniTranslate, follow these steps: + +1. Create a suitable conda environment for SoniTranslate and activate it: + +``` +conda create -n sonitr python=3.10 -y +conda activate sonitr +``` + +2. Clone this GitHub repository and navigate to it: +``` +git clone https://github.com/r3gm/SoniTranslate.git +cd SoniTranslate +``` +3. Install CUDA Toolkit 11.8.0: + +``` +conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit -y +``` + +4. Install PyTorch using conda: +``` +conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y +``` + +5. 
Install the required packages: + +``` +pip install -r requirements_base.txt -v +pip install -r requirements_extra.txt -v +pip install onnxruntime-gpu +``` + +6. Install [ffmpeg](https://ffmpeg.org/download.html). FFmpeg is a free software project that produces libraries and programs for handling multimedia data; you will need it to process audio and video files. You can install ffmpeg with Anaconda by running `conda install -y ffmpeg` in your terminal (recommended). If you have trouble installing ffmpeg via Anaconda, you can use the following link instead: (https://ffmpeg.org/ffmpeg.html). Once it is installed, make sure it is in your PATH by running `ffmpeg -h` in your terminal. If you don't get an error message, you're good to go. + +7. Optional installs: + +After installing FFmpeg, you can install these optional packages. + +[Coqui XTTS](https://github.com/coqui-ai/TTS) is a text-to-speech (TTS) model that lets you generate realistic voices in different languages. It can clone a voice from just a short audio clip and even speak it in a different language, like having a personal voice mimic for any text you need spoken. + +``` +pip install -q -r requirements_xtts.txt +pip install -q TTS==0.21.1 --no-deps +``` + +[Piper TTS](https://github.com/rhasspy/piper) is a fast, local neural text-to-speech system that sounds great and is optimized for the Raspberry Pi 4. Piper is used in a variety of projects. Voices are trained with VITS and exported to the onnxruntime. + +🚧 Note for Windows users: the Python module piper-tts is not fully supported on this operating system. While it works smoothly on Linux, Windows compatibility is currently experimental. If you still wish to install it on Windows, you can follow this experimental method: + +``` +pip install https://github.com/R3gm/piper-phonemize/releases/download/1.2.0/piper_phonemize-1.2.0-cp310-cp310-win_amd64.whl +pip install sherpa-onnx==1.9.12 +pip install piper-tts==1.2.0 --no-deps +``` + +8. Set your [Hugging Face token](https://huggingface.co/settings/tokens) as an environment variable (in quotes): + +``` +conda env config vars set YOUR_HF_TOKEN="YOUR_HUGGING_FACE_TOKEN_HERE" +conda deactivate +``` + + +### Running SoniTranslate + +To run SoniTranslate locally, make sure the `sonitr` conda environment is active: + +``` +conda activate sonitr +``` + +Then navigate to the `SoniTranslate` folder and run the `app_rvc.py` script: + +``` +python app_rvc.py +``` +When the local URL `http://127.0.0.1:7860` is displayed in the terminal, open it in your web browser to access the SoniTranslate interface. + +### Stopping SoniTranslate + +You can stop execution by pressing `Ctrl+C` in the terminal where you launched `app_rvc.py`; this interrupts the program and stops the Gradio app. +To deactivate the conda environment, use the following command: + +``` +conda deactivate +``` + +This deactivates the active `sonitr` environment and returns you to the base environment or the global Python environment. + +### Starting Over + +If you need to start over from scratch, you can delete the `SoniTranslate` folder and remove the `sonitr` conda environment with the following commands: + +``` +conda deactivate +conda env remove -n sonitr +``` + +With the `sonitr` environment removed, you can start over with a fresh installation. 
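After a fresh installation (or whenever you suspect a setup problem), a quick sanity check is to confirm that the PyTorch build inside the `sonitr` environment can actually see your GPU. The snippet below is only a minimal sketch using standard PyTorch calls; it is not part of SoniTranslate itself:

```
import torch

# CUDA version this PyTorch build was compiled against; expect "11.8" after step 4.
print("built with CUDA:", torch.version.cuda)

# True means the NVIDIA driver and GPU are visible to PyTorch.
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
```

If `torch.cuda.is_available()` returns `False`, recheck the driver and CUDA Toolkit steps above, or run SoniTranslate with the `--cpu_mode` flag of `app_rvc.py`.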
+ +### Notes +- To use OpenAI's GPT API for translation, set up your OpenAI API key as an environment variable in quotes: + +``` +conda activate sonitr +conda env config vars set OPENAI_API_KEY="your-api-key-here" +conda deactivate +``` + +- Alternatively, you can install the CUDA Toolkit 11.8.0 directly on your system [CUDA Toolkit 11.8.0](https://developer.nvidia.com/cuda-11-8-0-download-archive). \ No newline at end of file diff --git a/lib/audio.py b/lib/audio.py new file mode 100644 index 0000000000000000000000000000000000000000..5efe06b165341164b8c7761e6f5afc112928c551 --- /dev/null +++ b/lib/audio.py @@ -0,0 +1,21 @@ +import ffmpeg +import numpy as np + + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # To prevent beginners from copying paths with leading or trailing spaces, quotation marks, and line breaks. + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() diff --git a/lib/infer_pack/attentions.py b/lib/infer_pack/attentions.py new file mode 100644 index 0000000000000000000000000000000000000000..84d5c8730d4facf7ac9951d0339ab71256d7dc09 --- /dev/null +++ b/lib/infer_pack/attentions.py @@ -0,0 +1,417 @@ +import copy +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from lib.infer_pack import commons +from lib.infer_pack import modules +from lib.infer_pack.modules import LayerNorm + + +class Encoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + window_size=10, + **kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.window_size = window_size + + self.drop = nn.Dropout(p_dropout) + self.attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + window_size=window_size, + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask): + attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.attn_layers[i](x, x, attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class Decoder(nn.Module): + def __init__( + self, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size=1, + p_dropout=0.0, + proximal_bias=False, + proximal_init=True, + 
**kwargs + ): + super().__init__() + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + + self.drop = nn.Dropout(p_dropout) + self.self_attn_layers = nn.ModuleList() + self.norm_layers_0 = nn.ModuleList() + self.encdec_attn_layers = nn.ModuleList() + self.norm_layers_1 = nn.ModuleList() + self.ffn_layers = nn.ModuleList() + self.norm_layers_2 = nn.ModuleList() + for i in range(self.n_layers): + self.self_attn_layers.append( + MultiHeadAttention( + hidden_channels, + hidden_channels, + n_heads, + p_dropout=p_dropout, + proximal_bias=proximal_bias, + proximal_init=proximal_init, + ) + ) + self.norm_layers_0.append(LayerNorm(hidden_channels)) + self.encdec_attn_layers.append( + MultiHeadAttention( + hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout + ) + ) + self.norm_layers_1.append(LayerNorm(hidden_channels)) + self.ffn_layers.append( + FFN( + hidden_channels, + hidden_channels, + filter_channels, + kernel_size, + p_dropout=p_dropout, + causal=True, + ) + ) + self.norm_layers_2.append(LayerNorm(hidden_channels)) + + def forward(self, x, x_mask, h, h_mask): + """ + x: decoder input + h: encoder output + """ + self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to( + device=x.device, dtype=x.dtype + ) + encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1) + x = x * x_mask + for i in range(self.n_layers): + y = self.self_attn_layers[i](x, x, self_attn_mask) + y = self.drop(y) + x = self.norm_layers_0[i](x + y) + + y = self.encdec_attn_layers[i](x, h, encdec_attn_mask) + y = self.drop(y) + x = self.norm_layers_1[i](x + y) + + y = self.ffn_layers[i](x, x_mask) + y = self.drop(y) + x = self.norm_layers_2[i](x + y) + x = x * x_mask + return x + + +class MultiHeadAttention(nn.Module): + def __init__( + self, + channels, + out_channels, + n_heads, + p_dropout=0.0, + window_size=None, + heads_share=True, + block_length=None, + proximal_bias=False, + proximal_init=False, + ): + super().__init__() + assert channels % n_heads == 0 + + self.channels = channels + self.out_channels = out_channels + self.n_heads = n_heads + self.p_dropout = p_dropout + self.window_size = window_size + self.heads_share = heads_share + self.block_length = block_length + self.proximal_bias = proximal_bias + self.proximal_init = proximal_init + self.attn = None + + self.k_channels = channels // n_heads + self.conv_q = nn.Conv1d(channels, channels, 1) + self.conv_k = nn.Conv1d(channels, channels, 1) + self.conv_v = nn.Conv1d(channels, channels, 1) + self.conv_o = nn.Conv1d(channels, out_channels, 1) + self.drop = nn.Dropout(p_dropout) + + if window_size is not None: + n_heads_rel = 1 if heads_share else n_heads + rel_stddev = self.k_channels**-0.5 + self.emb_rel_k = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + self.emb_rel_v = nn.Parameter( + torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) + * rel_stddev + ) + + nn.init.xavier_uniform_(self.conv_q.weight) + nn.init.xavier_uniform_(self.conv_k.weight) + nn.init.xavier_uniform_(self.conv_v.weight) + if proximal_init: + with torch.no_grad(): + self.conv_k.weight.copy_(self.conv_q.weight) + self.conv_k.bias.copy_(self.conv_q.bias) + + def forward(self, x, c, attn_mask=None): + q = self.conv_q(x) + k = self.conv_k(c) + v = self.conv_v(c) + + x, self.attn = self.attention(q, 
k, v, mask=attn_mask) + + x = self.conv_o(x) + return x + + def attention(self, query, key, value, mask=None): + # reshape [b, d, t] -> [b, n_h, t, d_k] + b, d, t_s, t_t = (*key.size(), query.size(2)) + query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3) + key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3) + + scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1)) + if self.window_size is not None: + assert ( + t_s == t_t + ), "Relative attention is only available for self-attention." + key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s) + rel_logits = self._matmul_with_relative_keys( + query / math.sqrt(self.k_channels), key_relative_embeddings + ) + scores_local = self._relative_position_to_absolute_position(rel_logits) + scores = scores + scores_local + if self.proximal_bias: + assert t_s == t_t, "Proximal bias is only available for self-attention." + scores = scores + self._attention_bias_proximal(t_s).to( + device=scores.device, dtype=scores.dtype + ) + if mask is not None: + scores = scores.masked_fill(mask == 0, -1e4) + if self.block_length is not None: + assert ( + t_s == t_t + ), "Local attention is only available for self-attention." + block_mask = ( + torch.ones_like(scores) + .triu(-self.block_length) + .tril(self.block_length) + ) + scores = scores.masked_fill(block_mask == 0, -1e4) + p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s] + p_attn = self.drop(p_attn) + output = torch.matmul(p_attn, value) + if self.window_size is not None: + relative_weights = self._absolute_position_to_relative_position(p_attn) + value_relative_embeddings = self._get_relative_embeddings( + self.emb_rel_v, t_s + ) + output = output + self._matmul_with_relative_values( + relative_weights, value_relative_embeddings + ) + output = ( + output.transpose(2, 3).contiguous().view(b, d, t_t) + ) # [b, n_h, t_t, d_k] -> [b, d, t_t] + return output, p_attn + + def _matmul_with_relative_values(self, x, y): + """ + x: [b, h, l, m] + y: [h or 1, m, d] + ret: [b, h, l, d] + """ + ret = torch.matmul(x, y.unsqueeze(0)) + return ret + + def _matmul_with_relative_keys(self, x, y): + """ + x: [b, h, l, d] + y: [h or 1, m, d] + ret: [b, h, l, m] + """ + ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1)) + return ret + + def _get_relative_embeddings(self, relative_embeddings, length): + max_relative_position = 2 * self.window_size + 1 + # Pad first before slice to avoid using cond ops. + pad_length = max(length - (self.window_size + 1), 0) + slice_start_position = max((self.window_size + 1) - length, 0) + slice_end_position = slice_start_position + 2 * length - 1 + if pad_length > 0: + padded_relative_embeddings = F.pad( + relative_embeddings, + commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]), + ) + else: + padded_relative_embeddings = relative_embeddings + used_relative_embeddings = padded_relative_embeddings[ + :, slice_start_position:slice_end_position + ] + return used_relative_embeddings + + def _relative_position_to_absolute_position(self, x): + """ + x: [b, h, l, 2*l-1] + ret: [b, h, l, l] + """ + batch, heads, length, _ = x.size() + # Concat columns of pad to shift from relative to absolute indexing. + x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]])) + + # Concat extra elements so to add up to shape (len+1, 2*len-1). 
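+ # Because each padded row now holds 2*length entries while the new view uses + # rows of 2*length - 1, every row is shifted by one position relative to the + # previous one; this lines relative offsets up with absolute positions, and + # the final slice keeps the valid length x length block.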
+ x_flat = x.view([batch, heads, length * 2 * length]) + x_flat = F.pad( + x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]) + ) + + # Reshape and slice out the padded elements. + x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[ + :, :, :length, length - 1 : + ] + return x_final + + def _absolute_position_to_relative_position(self, x): + """ + x: [b, h, l, l] + ret: [b, h, l, 2*l-1] + """ + batch, heads, length, _ = x.size() + # padd along column + x = F.pad( + x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]) + ) + x_flat = x.view([batch, heads, length**2 + length * (length - 1)]) + # add 0's in the beginning that will skew the elements after reshape + x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]])) + x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:] + return x_final + + def _attention_bias_proximal(self, length): + """Bias for self-attention to encourage attention to close positions. + Args: + length: an integer scalar. + Returns: + a Tensor with shape [1, 1, length, length] + """ + r = torch.arange(length, dtype=torch.float32) + diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1) + return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0) + + +class FFN(nn.Module): + def __init__( + self, + in_channels, + out_channels, + filter_channels, + kernel_size, + p_dropout=0.0, + activation=None, + causal=False, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.activation = activation + self.causal = causal + + if causal: + self.padding = self._causal_padding + else: + self.padding = self._same_padding + + self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size) + self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size) + self.drop = nn.Dropout(p_dropout) + + def forward(self, x, x_mask): + x = self.conv_1(self.padding(x * x_mask)) + if self.activation == "gelu": + x = x * torch.sigmoid(1.702 * x) + else: + x = torch.relu(x) + x = self.drop(x) + x = self.conv_2(self.padding(x * x_mask)) + return x * x_mask + + def _causal_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = self.kernel_size - 1 + pad_r = 0 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x + + def _same_padding(self, x): + if self.kernel_size == 1: + return x + pad_l = (self.kernel_size - 1) // 2 + pad_r = self.kernel_size // 2 + padding = [[0, 0], [0, 0], [pad_l, pad_r]] + x = F.pad(x, commons.convert_pad_shape(padding)) + return x diff --git a/lib/infer_pack/commons.py b/lib/infer_pack/commons.py new file mode 100644 index 0000000000000000000000000000000000000000..4937729bddb0959bfb6c1f6b9da40f971db6db3d --- /dev/null +++ b/lib/infer_pack/commons.py @@ -0,0 +1,166 @@ +import math +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + + +def init_weights(m, mean=0.0, std=0.01): + classname = m.__class__.__name__ + if classname.find("Conv") != -1: + m.weight.data.normal_(mean, std) + + +def get_padding(kernel_size, dilation=1): + return int((kernel_size * dilation - dilation) / 2) + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def kl_divergence(m_p, logs_p, m_q, logs_q): + """KL(P||Q)""" + kl = (logs_q - logs_p) - 0.5 + kl 
+= ( + 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q) + ) + return kl + + +def rand_gumbel(shape): + """Sample from the Gumbel distribution, protect from overflows.""" + uniform_samples = torch.rand(shape) * 0.99998 + 0.00001 + return -torch.log(-torch.log(uniform_samples)) + + +def rand_gumbel_like(x): + g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device) + return g + + +def slice_segments(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, :, idx_str:idx_end] + return ret + + +def slice_segments2(x, ids_str, segment_size=4): + ret = torch.zeros_like(x[:, :segment_size]) + for i in range(x.size(0)): + idx_str = ids_str[i] + idx_end = idx_str + segment_size + ret[i] = x[i, idx_str:idx_end] + return ret + + +def rand_slice_segments(x, x_lengths=None, segment_size=4): + b, d, t = x.size() + if x_lengths is None: + x_lengths = t + ids_str_max = x_lengths - segment_size + 1 + ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long) + ret = slice_segments(x, ids_str, segment_size) + return ret, ids_str + + +def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4): + position = torch.arange(length, dtype=torch.float) + num_timescales = channels // 2 + log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / ( + num_timescales - 1 + ) + inv_timescales = min_timescale * torch.exp( + torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment + ) + scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1) + signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0) + signal = F.pad(signal, [0, 0, 0, channels % 2]) + signal = signal.view(1, channels, length) + return signal + + +def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return x + signal.to(dtype=x.dtype, device=x.device) + + +def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1): + b, channels, length = x.size() + signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale) + return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis) + + +def subsequent_mask(length): + mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0) + return mask + + +@torch.jit.script +def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels): + n_channels_int = n_channels[0] + in_act = input_a + input_b + t_act = torch.tanh(in_act[:, :n_channels_int, :]) + s_act = torch.sigmoid(in_act[:, n_channels_int:, :]) + acts = t_act * s_act + return acts + + +def convert_pad_shape(pad_shape): + l = pad_shape[::-1] + pad_shape = [item for sublist in l for item in sublist] + return pad_shape + + +def shift_1d(x): + x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1] + return x + + +def sequence_mask(length, max_length=None): + if max_length is None: + max_length = length.max() + x = torch.arange(max_length, dtype=length.dtype, device=length.device) + return x.unsqueeze(0) < length.unsqueeze(1) + + +def generate_path(duration, mask): + """ + duration: [b, 1, t_x] + mask: [b, 1, t_y, t_x] + """ + device = duration.device + + b, _, t_y, t_x = mask.shape + cum_duration = torch.cumsum(duration, -1) + + cum_duration_flat = cum_duration.view(b * t_x) + path = 
sequence_mask(cum_duration_flat, t_y).to(mask.dtype) + path = path.view(b, t_x, t_y) + path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1] + path = path.unsqueeze(1).transpose(2, 3) * mask + return path + + +def clip_grad_value_(parameters, clip_value, norm_type=2): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + norm_type = float(norm_type) + if clip_value is not None: + clip_value = float(clip_value) + + total_norm = 0 + for p in parameters: + param_norm = p.grad.data.norm(norm_type) + total_norm += param_norm.item() ** norm_type + if clip_value is not None: + p.grad.data.clamp_(min=-clip_value, max=clip_value) + total_norm = total_norm ** (1.0 / norm_type) + return total_norm diff --git a/lib/infer_pack/models.py b/lib/infer_pack/models.py new file mode 100644 index 0000000000000000000000000000000000000000..6022a9950976592d06dcd8ef2329035d64f4cc3c --- /dev/null +++ b/lib/infer_pack/models.py @@ -0,0 +1,1142 @@ +import math, pdb, os +from time import time as ttime +import torch +from torch import nn +from torch.nn import functional as F +from lib.infer_pack import modules +from lib.infer_pack import attentions +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm +from lib.infer_pack.commons import init_weights +import numpy as np +from lib.infer_pack import commons + + +class TextEncoder256(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(256, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class TextEncoder768(nn.Module): + def __init__( + self, + out_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=True, + ): + super().__init__() + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.emb_phone = nn.Linear(768, hidden_channels) + self.lrelu = nn.LeakyReLU(0.1, inplace=True) + if f0 == True: + self.emb_pitch = nn.Embedding(256, hidden_channels) # pitch 256 + self.encoder = 
attentions.Encoder( + hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, phone, pitch, lengths): + if pitch == None: + x = self.emb_phone(phone) + else: + x = self.emb_phone(phone) + self.emb_pitch(pitch) + x = x * math.sqrt(self.hidden_channels) # [b, t, h] + x = self.lrelu(x) + x = torch.transpose(x, 1, -1) # [b, h, t] + x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.encoder(x * x_mask, x_mask) + stats = self.proj(x) * x_mask + + m, logs = torch.split(stats, self.out_channels, dim=1) + return m, logs, x_mask + + +class ResidualCouplingBlock(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + n_flows=4, + gin_channels=0, + ): + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.n_flows = n_flows + self.gin_channels = gin_channels + + self.flows = nn.ModuleList() + for i in range(n_flows): + self.flows.append( + modules.ResidualCouplingLayer( + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + mean_only=True, + ) + ) + self.flows.append(modules.Flip()) + + def forward(self, x, x_mask, g=None, reverse=False): + if not reverse: + for flow in self.flows: + x, _ = flow(x, x_mask, g=g, reverse=reverse) + else: + for flow in reversed(self.flows): + x = flow(x, x_mask, g=g, reverse=reverse) + return x + + def remove_weight_norm(self): + for i in range(self.n_flows): + self.flows[i * 2].remove_weight_norm() + + +class PosteriorEncoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + ): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + + self.pre = nn.Conv1d(in_channels, hidden_channels, 1) + self.enc = modules.WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=gin_channels, + ) + self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1) + + def forward(self, x, x_lengths, g=None): + x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to( + x.dtype + ) + x = self.pre(x) * x_mask + x = self.enc(x, x_mask, g=g) + stats = self.proj(x) * x_mask + m, logs = torch.split(stats, self.out_channels, dim=1) + z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask + return z, m, logs, x_mask + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class Generator(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=0, + ): + super(Generator, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + self.ups.append( + weight_norm( + ConvTranspose1d( + 
upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + def forward(self, x, g=None): + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +class SineGen(torch.nn.Module): + """Definition of sine generator + SineGen(samp_rate, harmonic_num = 0, + sine_amp = 0.1, noise_std = 0.003, + voiced_threshold = 0, + flag_for_pulse=False) + samp_rate: sampling rate in Hz + harmonic_num: number of harmonic overtones (default 0) + sine_amp: amplitude of sine-wavefrom (default 0.1) + noise_std: std of Gaussian noise (default 0.003) + voiced_thoreshold: F0 threshold for U/V classification (default 0) + flag_for_pulse: this SinGen is used inside PulseGen (default False) + Note: when flag_for_pulse is True, the first time step of a voiced + segment is always sin(np.pi) or cos(0) + """ + + def __init__( + self, + samp_rate, + harmonic_num=0, + sine_amp=0.1, + noise_std=0.003, + voiced_threshold=0, + flag_for_pulse=False, + ): + super(SineGen, self).__init__() + self.sine_amp = sine_amp + self.noise_std = noise_std + self.harmonic_num = harmonic_num + self.dim = self.harmonic_num + 1 + self.sampling_rate = samp_rate + self.voiced_threshold = voiced_threshold + + def _f02uv(self, f0): + # generate uv signal + uv = torch.ones_like(f0) + uv = uv * (f0 > self.voiced_threshold) + return uv + + def forward(self, f0, upp): + """sine_tensor, uv = forward(f0) + input F0: tensor(batchsize=1, length, dim=1) + f0 for unvoiced steps should be 0 + output sine_tensor: tensor(batchsize=1, length, dim) + output uv: tensor(batchsize=1, length, 1) + """ + with torch.no_grad(): + f0 = f0[:, None].transpose(1, 2) + f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device) + # fundamental component + f0_buf[:, :, 0] = f0[:, :, 0] + for idx in np.arange(self.harmonic_num): + f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * ( + idx + 2 + ) # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic + rad_values = (f0_buf / self.sampling_rate) % 1 ###%1 means that the product of n_har cannot be post-processed and optimized + rand_ini = torch.rand( + f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device + ) + rand_ini[:, 0] = 0 + rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini + tmp_over_one = torch.cumsum(rad_values, 1) # % 1 #####%1 means that the following cumsum can no longer be optimized + tmp_over_one *= upp + tmp_over_one = F.interpolate( + tmp_over_one.transpose(2, 1), + scale_factor=upp, + mode="linear", + align_corners=True, + ).transpose(2, 1) + rad_values = F.interpolate( + 
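+ # Repeat each frame's phase increment (f0 / sampling_rate, wrapped into [0, 1)) `upp` times
+ # with nearest-neighbour interpolation so it is defined per audio sample; the cumulative sum
+ # below (with cumsum_shift compensating for phase wrap-arounds) yields the sine phase track.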
rad_values.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose( + 2, 1 + ) ####### + tmp_over_one %= 1 + tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0 + cumsum_shift = torch.zeros_like(rad_values) + cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0 + sine_waves = torch.sin( + torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi + ) + sine_waves = sine_waves * self.sine_amp + uv = self._f02uv(f0) + uv = F.interpolate( + uv.transpose(2, 1), scale_factor=upp, mode="nearest" + ).transpose(2, 1) + noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3 + noise = noise_amp * torch.randn_like(sine_waves) + sine_waves = sine_waves * uv + noise + return sine_waves, uv, noise + + +class SourceModuleHnNSF(torch.nn.Module): + """SourceModule for hn-nsf + SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1, + add_noise_std=0.003, voiced_threshod=0) + sampling_rate: sampling_rate in Hz + harmonic_num: number of harmonic above F0 (default: 0) + sine_amp: amplitude of sine source signal (default: 0.1) + add_noise_std: std of additive Gaussian noise (default: 0.003) + note that amplitude of noise in unvoiced is decided + by sine_amp + voiced_threshold: threhold to set U/V given F0 (default: 0) + Sine_source, noise_source = SourceModuleHnNSF(F0_sampled) + F0_sampled (batchsize, length, 1) + Sine_source (batchsize, length, 1) + noise_source (batchsize, length 1) + uv (batchsize, length, 1) + """ + + def __init__( + self, + sampling_rate, + harmonic_num=0, + sine_amp=0.1, + add_noise_std=0.003, + voiced_threshod=0, + is_half=True, + ): + super(SourceModuleHnNSF, self).__init__() + + self.sine_amp = sine_amp + self.noise_std = add_noise_std + self.is_half = is_half + # to produce sine waveforms + self.l_sin_gen = SineGen( + sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod + ) + + # to merge source harmonics into a single excitation + self.l_linear = torch.nn.Linear(harmonic_num + 1, 1) + self.l_tanh = torch.nn.Tanh() + + def forward(self, x, upp=None): + sine_wavs, uv, _ = self.l_sin_gen(x, upp) + if self.is_half: + sine_wavs = sine_wavs.half() + sine_merge = self.l_tanh(self.l_linear(sine_wavs)) + return sine_merge, None, None # noise, uv + + +class GeneratorNSF(torch.nn.Module): + def __init__( + self, + initial_channel, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels, + sr, + is_half=False, + ): + super(GeneratorNSF, self).__init__() + self.num_kernels = len(resblock_kernel_sizes) + self.num_upsamples = len(upsample_rates) + + self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates)) + self.m_source = SourceModuleHnNSF( + sampling_rate=sr, harmonic_num=0, is_half=is_half + ) + self.noise_convs = nn.ModuleList() + self.conv_pre = Conv1d( + initial_channel, upsample_initial_channel, 7, 1, padding=3 + ) + resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2 + + self.ups = nn.ModuleList() + for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)): + c_cur = upsample_initial_channel // (2 ** (i + 1)) + self.ups.append( + weight_norm( + ConvTranspose1d( + upsample_initial_channel // (2**i), + upsample_initial_channel // (2 ** (i + 1)), + k, + u, + padding=(k - u) // 2, + ) + ) + ) + if i + 1 < len(upsample_rates): + stride_f0 = np.prod(upsample_rates[i + 1 :]) + self.noise_convs.append( + Conv1d( + 1, + c_cur, + kernel_size=stride_f0 * 2, + stride=stride_f0, + padding=stride_f0 // 2, + ) + 
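+ # Strided Conv1d that downsamples the sample-rate harmonic source to this stage's
+ # feature rate (stride = product of the remaining upsample factors), so it can be
+ # added to x right after self.ups[i] in forward().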
) + else: + self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1)) + + self.resblocks = nn.ModuleList() + for i in range(len(self.ups)): + ch = upsample_initial_channel // (2 ** (i + 1)) + for j, (k, d) in enumerate( + zip(resblock_kernel_sizes, resblock_dilation_sizes) + ): + self.resblocks.append(resblock(ch, k, d)) + + self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False) + self.ups.apply(init_weights) + + if gin_channels != 0: + self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1) + + self.upp = np.prod(upsample_rates) + + def forward(self, x, f0, g=None): + har_source, noi_source, uv = self.m_source(f0, self.upp) + har_source = har_source.transpose(1, 2) + x = self.conv_pre(x) + if g is not None: + x = x + self.cond(g) + + for i in range(self.num_upsamples): + x = F.leaky_relu(x, modules.LRELU_SLOPE) + x = self.ups[i](x) + x_source = self.noise_convs[i](har_source) + x = x + x_source + xs = None + for j in range(self.num_kernels): + if xs is None: + xs = self.resblocks[i * self.num_kernels + j](x) + else: + xs += self.resblocks[i * self.num_kernels + j](x) + x = xs / self.num_kernels + x = F.leaky_relu(x) + x = self.conv_post(x) + x = torch.tanh(x) + return x + + def remove_weight_norm(self): + for l in self.ups: + remove_weight_norm(l) + for l in self.resblocks: + l.remove_weight_norm() + + +sr2sr = { + "32k": 32000, + "40k": 40000, + "48k": 48000, +} + + +class SynthesizerTrnMs256NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, 
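+ # phone: frame-level content features (the 256-dim variant, per TextEncoder256);
+ # pitch: coarse F0 indices for the pitch embedding; pitchf: continuous F0 curve
+ # consumed by the NSF decoder; y / y_lengths: target spectrogram for the posterior
+ # encoder; ds: speaker id used for the global conditioning embedding.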
y_lengths, ds + ): # Here ds is id, [bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr, + **kwargs + ): + super().__init__() + if type(sr) == type("strr"): + sr = sr2sr[sr] + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + ) + self.dec = GeneratorNSF( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + sr=sr, + is_half=kwargs["is_half"], + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward( + self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds + ): # Here ds is id,[bs,1] + # print(1,pitch.shape)#[bs,t] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast + m_p, logs_p, x_mask = 
self.enc_p(phone, pitch, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length) + pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size) + # print(-2,pitchf.shape,z_slice.shape) + o = self.dec(z_slice, pitchf, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + nsff0 = nsff0[:, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, nsff0, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs256NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder256( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # Here ds is id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, 
z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class SynthesizerTrnMs768NSFsid_nono(nn.Module): + def __init__( + self, + spec_channels, + segment_size, + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + spk_embed_dim, + gin_channels, + sr=None, + **kwargs + ): + super().__init__() + self.spec_channels = spec_channels + self.inter_channels = inter_channels + self.hidden_channels = hidden_channels + self.filter_channels = filter_channels + self.n_heads = n_heads + self.n_layers = n_layers + self.kernel_size = kernel_size + self.p_dropout = p_dropout + self.resblock = resblock + self.resblock_kernel_sizes = resblock_kernel_sizes + self.resblock_dilation_sizes = resblock_dilation_sizes + self.upsample_rates = upsample_rates + self.upsample_initial_channel = upsample_initial_channel + self.upsample_kernel_sizes = upsample_kernel_sizes + self.segment_size = segment_size + self.gin_channels = gin_channels + # self.hop_length = hop_length# + self.spk_embed_dim = spk_embed_dim + self.enc_p = TextEncoder768( + inter_channels, + hidden_channels, + filter_channels, + n_heads, + n_layers, + kernel_size, + p_dropout, + f0=False, + ) + self.dec = Generator( + inter_channels, + resblock, + resblock_kernel_sizes, + resblock_dilation_sizes, + upsample_rates, + upsample_initial_channel, + upsample_kernel_sizes, + gin_channels=gin_channels, + ) + self.enc_q = PosteriorEncoder( + spec_channels, + inter_channels, + hidden_channels, + 5, + 1, + 16, + gin_channels=gin_channels, + ) + self.flow = ResidualCouplingBlock( + inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels + ) + self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels) + print("gin_channels:", gin_channels, "self.spk_embed_dim:", self.spk_embed_dim) + + def remove_weight_norm(self): + self.dec.remove_weight_norm() + self.flow.remove_weight_norm() + self.enc_q.remove_weight_norm() + + def forward(self, phone, phone_lengths, y, y_lengths, ds): # Here ds is id,[bs,1] + g = self.emb_g(ds).unsqueeze(-1) # [b, 256, 1]##1 is t, broadcast + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g) + z_p = self.flow(z, y_mask, g=g) + z_slice, ids_slice = commons.rand_slice_segments( + z, y_lengths, self.segment_size + ) + o = self.dec(z_slice, g=g) + return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q) + + def infer(self, phone, phone_lengths, sid, rate=None): + g = self.emb_g(sid).unsqueeze(-1) + m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths) + z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask + if rate: + head = int(z_p.shape[2] * rate) + z_p = z_p[:, :, -head:] + x_mask = x_mask[:, :, -head:] + z = self.flow(z_p, x_mask, g=g, reverse=True) + o = self.dec(z * x_mask, g=g) + return o, x_mask, (z, z_p, m_p, logs_p) + + +class MultiPeriodDiscriminator(torch.nn.Module): + def __init__(self, 
use_spectral_norm=False): + super(MultiPeriodDiscriminator, self).__init__() + periods = [2, 3, 5, 7, 11, 17] + # periods = [3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class MultiPeriodDiscriminatorV2(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(MultiPeriodDiscriminatorV2, self).__init__() + # periods = [2, 3, 5, 7, 11, 17] + periods = [2, 3, 5, 7, 11, 17, 23, 37] + + discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)] + discs = discs + [ + DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods + ] + self.discriminators = nn.ModuleList(discs) + + def forward(self, y, y_hat): + y_d_rs = [] # + y_d_gs = [] + fmap_rs = [] + fmap_gs = [] + for i, d in enumerate(self.discriminators): + y_d_r, fmap_r = d(y) + y_d_g, fmap_g = d(y_hat) + # for j in range(len(fmap_r)): + # print(i,j,y.shape,y_hat.shape,fmap_r[j].shape,fmap_g[j].shape) + y_d_rs.append(y_d_r) + y_d_gs.append(y_d_g) + fmap_rs.append(fmap_r) + fmap_gs.append(fmap_g) + + return y_d_rs, y_d_gs, fmap_rs, fmap_gs + + +class DiscriminatorS(torch.nn.Module): + def __init__(self, use_spectral_norm=False): + super(DiscriminatorS, self).__init__() + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f(Conv1d(1, 16, 15, 1, padding=7)), + norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)), + norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)), + norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)), + norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)), + norm_f(Conv1d(1024, 1024, 5, 1, padding=2)), + ] + ) + self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1)) + + def forward(self, x): + fmap = [] + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap + + +class DiscriminatorP(torch.nn.Module): + def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False): + super(DiscriminatorP, self).__init__() + self.period = period + self.use_spectral_norm = use_spectral_norm + norm_f = weight_norm if use_spectral_norm == False else spectral_norm + self.convs = nn.ModuleList( + [ + norm_f( + Conv2d( + 1, + 32, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 32, + 128, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 128, + 512, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 512, + 1024, + (kernel_size, 1), + (stride, 1), + padding=(get_padding(kernel_size, 1), 0), + ) + ), + norm_f( + Conv2d( + 1024, + 1024, + (kernel_size, 1), + 1, + padding=(get_padding(kernel_size, 1), 0), + ) + ), + ] + ) + self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0))) + + def forward(self, x): + 
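+ # As in HiFi-GAN's multi-period discriminator: the waveform is padded to a multiple of
+ # `period` and folded into a [b, c, t // period, period] grid, so the (k, 1) convolutions
+ # compare samples spaced exactly `period` apart (e.g. period=3, t=9: [b, c, 9] -> [b, c, 3, 3]).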
fmap = [] + + # 1d to 2d + b, c, t = x.shape + if t % self.period != 0: # pad first + n_pad = self.period - (t % self.period) + x = F.pad(x, (0, n_pad), "reflect") + t = t + n_pad + x = x.view(b, c, t // self.period, self.period) + + for l in self.convs: + x = l(x) + x = F.leaky_relu(x, modules.LRELU_SLOPE) + fmap.append(x) + x = self.conv_post(x) + fmap.append(x) + x = torch.flatten(x, 1, -1) + + return x, fmap diff --git a/lib/infer_pack/modules.py b/lib/infer_pack/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..b54dc47777363151f1f09deb1284e13e106b9fd1 --- /dev/null +++ b/lib/infer_pack/modules.py @@ -0,0 +1,522 @@ +import copy +import math +import numpy as np +import scipy +import torch +from torch import nn +from torch.nn import functional as F + +from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d +from torch.nn.utils import weight_norm, remove_weight_norm + +from lib.infer_pack import commons +from lib.infer_pack.commons import init_weights, get_padding +from lib.infer_pack.transforms import piecewise_rational_quadratic_transform + + +LRELU_SLOPE = 0.1 + + +class LayerNorm(nn.Module): + def __init__(self, channels, eps=1e-5): + super().__init__() + self.channels = channels + self.eps = eps + + self.gamma = nn.Parameter(torch.ones(channels)) + self.beta = nn.Parameter(torch.zeros(channels)) + + def forward(self, x): + x = x.transpose(1, -1) + x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps) + return x.transpose(1, -1) + + +class ConvReluNorm(nn.Module): + def __init__( + self, + in_channels, + hidden_channels, + out_channels, + kernel_size, + n_layers, + p_dropout, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_channels = hidden_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + assert n_layers > 1, "Number of layers should be larger than 0." 
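+ # n_layers blocks of Conv1d -> LayerNorm -> ReLU -> Dropout are built below; forward()
+ # adds a zero-initialised 1x1 projection of the result back onto the input as a residual.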
+ + self.conv_layers = nn.ModuleList() + self.norm_layers = nn.ModuleList() + self.conv_layers.append( + nn.Conv1d( + in_channels, hidden_channels, kernel_size, padding=kernel_size // 2 + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout)) + for _ in range(n_layers - 1): + self.conv_layers.append( + nn.Conv1d( + hidden_channels, + hidden_channels, + kernel_size, + padding=kernel_size // 2, + ) + ) + self.norm_layers.append(LayerNorm(hidden_channels)) + self.proj = nn.Conv1d(hidden_channels, out_channels, 1) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask): + x_org = x + for i in range(self.n_layers): + x = self.conv_layers[i](x * x_mask) + x = self.norm_layers[i](x) + x = self.relu_drop(x) + x = x_org + self.proj(x) + return x * x_mask + + +class DDSConv(nn.Module): + """ + Dialted and Depth-Separable Convolution + """ + + def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0): + super().__init__() + self.channels = channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.p_dropout = p_dropout + + self.drop = nn.Dropout(p_dropout) + self.convs_sep = nn.ModuleList() + self.convs_1x1 = nn.ModuleList() + self.norms_1 = nn.ModuleList() + self.norms_2 = nn.ModuleList() + for i in range(n_layers): + dilation = kernel_size**i + padding = (kernel_size * dilation - dilation) // 2 + self.convs_sep.append( + nn.Conv1d( + channels, + channels, + kernel_size, + groups=channels, + dilation=dilation, + padding=padding, + ) + ) + self.convs_1x1.append(nn.Conv1d(channels, channels, 1)) + self.norms_1.append(LayerNorm(channels)) + self.norms_2.append(LayerNorm(channels)) + + def forward(self, x, x_mask, g=None): + if g is not None: + x = x + g + for i in range(self.n_layers): + y = self.convs_sep[i](x * x_mask) + y = self.norms_1[i](y) + y = F.gelu(y) + y = self.convs_1x1[i](y) + y = self.norms_2[i](y) + y = F.gelu(y) + y = self.drop(y) + x = x + y + return x * x_mask + + +class WN(torch.nn.Module): + def __init__( + self, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + gin_channels=0, + p_dropout=0, + ): + super(WN, self).__init__() + assert kernel_size % 2 == 1 + self.hidden_channels = hidden_channels + self.kernel_size = (kernel_size,) + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.gin_channels = gin_channels + self.p_dropout = p_dropout + + self.in_layers = torch.nn.ModuleList() + self.res_skip_layers = torch.nn.ModuleList() + self.drop = nn.Dropout(p_dropout) + + if gin_channels != 0: + cond_layer = torch.nn.Conv1d( + gin_channels, 2 * hidden_channels * n_layers, 1 + ) + self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight") + + for i in range(n_layers): + dilation = dilation_rate**i + padding = int((kernel_size * dilation - dilation) / 2) + in_layer = torch.nn.Conv1d( + hidden_channels, + 2 * hidden_channels, + kernel_size, + dilation=dilation, + padding=padding, + ) + in_layer = torch.nn.utils.weight_norm(in_layer, name="weight") + self.in_layers.append(in_layer) + + # last one is not necessary + if i < n_layers - 1: + res_skip_channels = 2 * hidden_channels + else: + res_skip_channels = hidden_channels + + res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1) + res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight") + self.res_skip_layers.append(res_skip_layer) + + def forward(self, x, x_mask, g=None, **kwargs): + output = torch.zeros_like(x) + 
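+ # WaveNet-style loop: each layer applies a dilated conv, adds its slice of the global
+ # conditioning g (if any), gates it with the fused tanh/sigmoid unit, then splits the
+ # result into a residual added to x and a skip contribution accumulated in `output`.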
n_channels_tensor = torch.IntTensor([self.hidden_channels]) + + if g is not None: + g = self.cond_layer(g) + + for i in range(self.n_layers): + x_in = self.in_layers[i](x) + if g is not None: + cond_offset = i * 2 * self.hidden_channels + g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :] + else: + g_l = torch.zeros_like(x_in) + + acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor) + acts = self.drop(acts) + + res_skip_acts = self.res_skip_layers[i](acts) + if i < self.n_layers - 1: + res_acts = res_skip_acts[:, : self.hidden_channels, :] + x = (x + res_acts) * x_mask + output = output + res_skip_acts[:, self.hidden_channels :, :] + else: + output = output + res_skip_acts + return output * x_mask + + def remove_weight_norm(self): + if self.gin_channels != 0: + torch.nn.utils.remove_weight_norm(self.cond_layer) + for l in self.in_layers: + torch.nn.utils.remove_weight_norm(l) + for l in self.res_skip_layers: + torch.nn.utils.remove_weight_norm(l) + + +class ResBlock1(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)): + super(ResBlock1, self).__init__() + self.convs1 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[2], + padding=get_padding(kernel_size, dilation[2]), + ) + ), + ] + ) + self.convs1.apply(init_weights) + + self.convs2 = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=1, + padding=get_padding(kernel_size, 1), + ) + ), + ] + ) + self.convs2.apply(init_weights) + + def forward(self, x, x_mask=None): + for c1, c2 in zip(self.convs1, self.convs2): + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c1(xt) + xt = F.leaky_relu(xt, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c2(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs1: + remove_weight_norm(l) + for l in self.convs2: + remove_weight_norm(l) + + +class ResBlock2(torch.nn.Module): + def __init__(self, channels, kernel_size=3, dilation=(1, 3)): + super(ResBlock2, self).__init__() + self.convs = nn.ModuleList( + [ + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[0], + padding=get_padding(kernel_size, dilation[0]), + ) + ), + weight_norm( + Conv1d( + channels, + channels, + kernel_size, + 1, + dilation=dilation[1], + padding=get_padding(kernel_size, dilation[1]), + ) + ), + ] + ) + self.convs.apply(init_weights) + + def forward(self, x, x_mask=None): + for c in self.convs: + xt = F.leaky_relu(x, LRELU_SLOPE) + if x_mask is not None: + xt = xt * x_mask + xt = c(xt) + x = xt + x + if x_mask is not None: + x = x * x_mask + return x + + def remove_weight_norm(self): + for l in self.convs: + remove_weight_norm(l) + + +class Log(nn.Module): + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = 
torch.log(torch.clamp_min(x, 1e-5)) * x_mask + logdet = torch.sum(-y, [1, 2]) + return y, logdet + else: + x = torch.exp(x) * x_mask + return x + + +class Flip(nn.Module): + def forward(self, x, *args, reverse=False, **kwargs): + x = torch.flip(x, [1]) + if not reverse: + logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device) + return x, logdet + else: + return x + + +class ElementwiseAffine(nn.Module): + def __init__(self, channels): + super().__init__() + self.channels = channels + self.m = nn.Parameter(torch.zeros(channels, 1)) + self.logs = nn.Parameter(torch.zeros(channels, 1)) + + def forward(self, x, x_mask, reverse=False, **kwargs): + if not reverse: + y = self.m + torch.exp(self.logs) * x + y = y * x_mask + logdet = torch.sum(self.logs * x_mask, [1, 2]) + return y, logdet + else: + x = (x - self.m) * torch.exp(-self.logs) * x_mask + return x + + +class ResidualCouplingLayer(nn.Module): + def __init__( + self, + channels, + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=0, + gin_channels=0, + mean_only=False, + ): + assert channels % 2 == 0, "channels should be divisible by 2" + super().__init__() + self.channels = channels + self.hidden_channels = hidden_channels + self.kernel_size = kernel_size + self.dilation_rate = dilation_rate + self.n_layers = n_layers + self.half_channels = channels // 2 + self.mean_only = mean_only + + self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1) + self.enc = WN( + hidden_channels, + kernel_size, + dilation_rate, + n_layers, + p_dropout=p_dropout, + gin_channels=gin_channels, + ) + self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1) + self.post.weight.data.zero_() + self.post.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) * x_mask + h = self.enc(h, x_mask, g=g) + stats = self.post(h) * x_mask + if not self.mean_only: + m, logs = torch.split(stats, [self.half_channels] * 2, 1) + else: + m = stats + logs = torch.zeros_like(m) + + if not reverse: + x1 = m + x1 * torch.exp(logs) * x_mask + x = torch.cat([x0, x1], 1) + logdet = torch.sum(logs, [1, 2]) + return x, logdet + else: + x1 = (x1 - m) * torch.exp(-logs) * x_mask + x = torch.cat([x0, x1], 1) + return x + + def remove_weight_norm(self): + self.enc.remove_weight_norm() + + +class ConvFlow(nn.Module): + def __init__( + self, + in_channels, + filter_channels, + kernel_size, + n_layers, + num_bins=10, + tail_bound=5.0, + ): + super().__init__() + self.in_channels = in_channels + self.filter_channels = filter_channels + self.kernel_size = kernel_size + self.n_layers = n_layers + self.num_bins = num_bins + self.tail_bound = tail_bound + self.half_channels = in_channels // 2 + + self.pre = nn.Conv1d(self.half_channels, filter_channels, 1) + self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0) + self.proj = nn.Conv1d( + filter_channels, self.half_channels * (num_bins * 3 - 1), 1 + ) + self.proj.weight.data.zero_() + self.proj.bias.data.zero_() + + def forward(self, x, x_mask, g=None, reverse=False): + x0, x1 = torch.split(x, [self.half_channels] * 2, 1) + h = self.pre(x0) + h = self.convs(h, x_mask, g=g) + h = self.proj(h) * x_mask + + b, c, t = x0.shape + h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?] 
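+ # The projection is split below into the spline parameters for x1: unnormalized bin
+ # widths and heights (num_bins each) and interior knot derivatives (num_bins - 1),
+ # matching the half_channels * (num_bins * 3 - 1) outputs produced by self.proj.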
+ + unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels) + unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt( + self.filter_channels + ) + unnormalized_derivatives = h[..., 2 * self.num_bins :] + + x1, logabsdet = piecewise_rational_quadratic_transform( + x1, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=reverse, + tails="linear", + tail_bound=self.tail_bound, + ) + + x = torch.cat([x0, x1], 1) * x_mask + logdet = torch.sum(logabsdet * x_mask, [1, 2]) + if not reverse: + return x, logdet + else: + return x diff --git a/lib/infer_pack/transforms.py b/lib/infer_pack/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..7d93c482d296f8bafd9cb5f2a6b1d7c7608cd17c --- /dev/null +++ b/lib/infer_pack/transforms.py @@ -0,0 +1,209 @@ +import torch +from torch.nn import functional as F + +import numpy as np + + +DEFAULT_MIN_BIN_WIDTH = 1e-3 +DEFAULT_MIN_BIN_HEIGHT = 1e-3 +DEFAULT_MIN_DERIVATIVE = 1e-3 + + +def piecewise_rational_quadratic_transform( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails=None, + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if tails is None: + spline_fn = rational_quadratic_spline + spline_kwargs = {} + else: + spline_fn = unconstrained_rational_quadratic_spline + spline_kwargs = {"tails": tails, "tail_bound": tail_bound} + + outputs, logabsdet = spline_fn( + inputs=inputs, + unnormalized_widths=unnormalized_widths, + unnormalized_heights=unnormalized_heights, + unnormalized_derivatives=unnormalized_derivatives, + inverse=inverse, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + **spline_kwargs + ) + return outputs, logabsdet + + +def searchsorted(bin_locations, inputs, eps=1e-6): + bin_locations[..., -1] += eps + return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1 + + +def unconstrained_rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + tails="linear", + tail_bound=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound) + outside_interval_mask = ~inside_interval_mask + + outputs = torch.zeros_like(inputs) + logabsdet = torch.zeros_like(inputs) + + if tails == "linear": + unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1)) + constant = np.log(np.exp(1 - min_derivative) - 1) + unnormalized_derivatives[..., 0] = constant + unnormalized_derivatives[..., -1] = constant + + outputs[outside_interval_mask] = inputs[outside_interval_mask] + logabsdet[outside_interval_mask] = 0 + else: + raise RuntimeError("{} tails are not implemented.".format(tails)) + + ( + outputs[inside_interval_mask], + logabsdet[inside_interval_mask], + ) = rational_quadratic_spline( + inputs=inputs[inside_interval_mask], + unnormalized_widths=unnormalized_widths[inside_interval_mask, :], + unnormalized_heights=unnormalized_heights[inside_interval_mask, :], + unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :], + inverse=inverse, + left=-tail_bound, + right=tail_bound, + bottom=-tail_bound, + top=tail_bound, + min_bin_width=min_bin_width, + min_bin_height=min_bin_height, + min_derivative=min_derivative, + ) + 
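+ # Inputs outside [-tail_bound, tail_bound] keep the identity mapping (log|det| = 0) set
+ # above for the "linear" tails; only the in-interval entries were transformed by the spline.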
+ return outputs, logabsdet + + +def rational_quadratic_spline( + inputs, + unnormalized_widths, + unnormalized_heights, + unnormalized_derivatives, + inverse=False, + left=0.0, + right=1.0, + bottom=0.0, + top=1.0, + min_bin_width=DEFAULT_MIN_BIN_WIDTH, + min_bin_height=DEFAULT_MIN_BIN_HEIGHT, + min_derivative=DEFAULT_MIN_DERIVATIVE, +): + if torch.min(inputs) < left or torch.max(inputs) > right: + raise ValueError("Input to a transform is not within its domain") + + num_bins = unnormalized_widths.shape[-1] + + if min_bin_width * num_bins > 1.0: + raise ValueError("Minimal bin width too large for the number of bins") + if min_bin_height * num_bins > 1.0: + raise ValueError("Minimal bin height too large for the number of bins") + + widths = F.softmax(unnormalized_widths, dim=-1) + widths = min_bin_width + (1 - min_bin_width * num_bins) * widths + cumwidths = torch.cumsum(widths, dim=-1) + cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0) + cumwidths = (right - left) * cumwidths + left + cumwidths[..., 0] = left + cumwidths[..., -1] = right + widths = cumwidths[..., 1:] - cumwidths[..., :-1] + + derivatives = min_derivative + F.softplus(unnormalized_derivatives) + + heights = F.softmax(unnormalized_heights, dim=-1) + heights = min_bin_height + (1 - min_bin_height * num_bins) * heights + cumheights = torch.cumsum(heights, dim=-1) + cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0) + cumheights = (top - bottom) * cumheights + bottom + cumheights[..., 0] = bottom + cumheights[..., -1] = top + heights = cumheights[..., 1:] - cumheights[..., :-1] + + if inverse: + bin_idx = searchsorted(cumheights, inputs)[..., None] + else: + bin_idx = searchsorted(cumwidths, inputs)[..., None] + + input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0] + input_bin_widths = widths.gather(-1, bin_idx)[..., 0] + + input_cumheights = cumheights.gather(-1, bin_idx)[..., 0] + delta = heights / widths + input_delta = delta.gather(-1, bin_idx)[..., 0] + + input_derivatives = derivatives.gather(-1, bin_idx)[..., 0] + input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0] + + input_heights = heights.gather(-1, bin_idx)[..., 0] + + if inverse: + a = (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + input_heights * (input_delta - input_derivatives) + b = input_heights * input_derivatives - (inputs - input_cumheights) * ( + input_derivatives + input_derivatives_plus_one - 2 * input_delta + ) + c = -input_delta * (inputs - input_cumheights) + + discriminant = b.pow(2) - 4 * a * c + assert (discriminant >= 0).all() + + root = (2 * c) / (-b - torch.sqrt(discriminant)) + outputs = root * input_bin_widths + input_cumwidths + + theta_one_minus_theta = root * (1 - root) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * theta_one_minus_theta + ) + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * root.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - root).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, -logabsdet + else: + theta = (inputs - input_cumwidths) / input_bin_widths + theta_one_minus_theta = theta * (1 - theta) + + numerator = input_heights * ( + input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta + ) + denominator = input_delta + ( + (input_derivatives + input_derivatives_plus_one - 2 * input_delta) + * 
theta_one_minus_theta + ) + outputs = input_cumheights + numerator / denominator + + derivative_numerator = input_delta.pow(2) * ( + input_derivatives_plus_one * theta.pow(2) + + 2 * input_delta * theta_one_minus_theta + + input_derivatives * (1 - theta).pow(2) + ) + logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator) + + return outputs, logabsdet diff --git a/lib/rmvpe.py b/lib/rmvpe.py new file mode 100644 index 0000000000000000000000000000000000000000..38e8bc44e0e769ffbaa25d239345a0352c1895de --- /dev/null +++ b/lib/rmvpe.py @@ -0,0 +1,422 @@ +import torch, numpy as np +import torch.nn as nn +import torch.nn.functional as F + + + +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x): + concat_tensors = [] + x = self.bn(x) + for i in range(self.n_encoders): + _, x = self.layers[i](x) + concat_tensors.append(_) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + 
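+ # U-Net bottleneck: n_inters residual encoder blocks with kernel_size=None,
+ # i.e. no pooling, so the feature map size is preserved.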
self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + return x + + +from librosa.filters import mel + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + 
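+ # The HTK-style mel filterbank is computed once with librosa and registered as a buffer;
+ # Hann windows are cached per (keyshift, device) in self.hann_window, and n_fft falls
+ # back to win_length when not given.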
n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + audio.device + ) + fft = torch.stft( + audio, + n_fft=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window=self.hann_window[keyshift_key], + center=center, + return_complex=True, + ) + magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half == True: + model = model.half() + self.model = model + self.resample_kernel = {} + self.is_half = is_half + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device = device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + mel = F.pad( + mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" + ) + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + # torch.cuda.synchronize() + # t0=ttime() + mel = self.mel_extractor(audio, center=True) + # torch.cuda.synchronize() + # t1=ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + # t2=ttime() + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + # t3=ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def pitch_based_audio_inference(self, audio, thred=0.03, f0_min=50, 
f0_max=1100): + audio = torch.from_numpy(audio).float().to(self.device).unsqueeze(0) + mel = self.mel_extractor(audio, center=True) + hidden = self.mel2hidden(mel) + hidden = hidden.squeeze(0).cpu().numpy() + if self.is_half == True: + hidden = hidden.astype("float32") + f0 = self.decode(hidden, thred=thred) + f0[(f0 < f0_min) | (f0 > f0_max)] = 0 + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # frame length#index + salience = np.pad(salience, ((0, 0), (4, 4))) # frame length,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # frame length,9 + todo_cents_mapping = np.array(todo_cents_mapping) # frame length,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # frame length + devided = product_sum / weight_sum # frame length + # t3 = ttime() + maxx = np.max(salience, axis=1) # frame length + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided diff --git a/mdx_models/data.json b/mdx_models/data.json new file mode 100644 index 0000000000000000000000000000000000000000..fb15cb3ffbb86226b5ecbc96b8ed2d651e834bb7 --- /dev/null +++ b/mdx_models/data.json @@ -0,0 +1,354 @@ +{ + "0ddfc0eb5792638ad5dc27850236c246": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "26d308f91f3423a67dc69a6d12a8793d": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 8192, + "primary_stem": "Other" + }, + "2cdd429caac38f0194b133884160f2c6": { + "compensate": 1.045, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "2f5501189a2f6db6349916fabe8c90de": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "398580b6d5d973af3120df54cee6759d": { + "compensate": 1.75, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "488b3e6f8bd3717d9d7c428476be2d75": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "4910e7827f335048bdac11fa967772f9": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 7, + "mdx_n_fft_scale_set": 4096, + "primary_stem": "Drums" + }, + "53c4baf4d12c3e6c3831bb8f5b532b93": { + "compensate": 1.043, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "5d343409ef0df48c7d78cce9f0106781": { + "compensate": 1.075, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "5f6483271e1efb9bfb59e4a3e6d4d098": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "65ab5919372a128e4167f5e01a8fda85": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 8192, + "primary_stem": "Other" + }, + 
"6703e39f36f18aa7855ee1047765621d": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 16384, + "primary_stem": "Bass" + }, + "6b31de20e84392859a3d09d43f089515": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "867595e9de46f6ab699008295df62798": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "a3cd63058945e777505c01d2507daf37": { + "compensate": 1.03, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "b33d9b3950b6cbf5fe90a32608924700": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "c3b29bdce8c4fa17ec609e16220330ab": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 16384, + "primary_stem": "Bass" + }, + "ceed671467c1f64ebdfac8a2490d0d52": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "d2a1376f310e4f7fa37fb9b5774eb701": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "d7bff498db9324db933d913388cba6be": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "d94058f8c7f1fae4164868ae8ae66b20": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Vocals" + }, + "dc41ede5961d50f277eb846db17f5319": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 4096, + "primary_stem": "Drums" + }, + "e5572e58abf111f80d8241d2e44e7fa4": { + "compensate": 1.028, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "e7324c873b1f615c35c1967f912db92a": { + "compensate": 1.03, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "1c56ec0224f1d559c42fd6fd2a67b154": { + "compensate": 1.025, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "f2df6d6863d8f435436d8b561594ff49": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "b06327a00d5e5fbc7d96e1781bbdb596": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "94ff780b977d3ca07c7a343dab2e25dd": { + "compensate": 1.039, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "73492b58195c3b52d34590d5474452f6": { + "compensate": 1.043, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "970b3f9492014d18fefeedfe4773cb42": { + "compensate": 1.009, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "1d64a6d2c30f709b8c9b4ce1366d96ee": { + "compensate": 1.035, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "203f2a3955221b64df85a41af87cf8f0": { + "compensate": 
1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "291c2049608edb52648b96e27eb80e95": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "ead8d05dab12ec571d67549b3aab03fc": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "cc63408db3d80b4d85b0287d1d7c9632": { + "compensate": 1.033, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "cd5b2989ad863f116c855db1dfe24e39": { + "compensate": 1.035, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 9, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Other" + }, + "55657dd70583b0fedfba5f67df11d711": { + "compensate": 1.022, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 6144, + "primary_stem": "Instrumental" + }, + "b6bccda408a436db8500083ef3491e8b": { + "compensate": 1.02, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "8a88db95c7fb5dbe6a095ff2ffb428b1": { + "compensate": 1.026, + "mdx_dim_f_set": 2048, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "b78da4afc6512f98e4756f5977f5c6b9": { + "compensate": 1.021, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Instrumental" + }, + "77d07b2667ddf05b9e3175941b4454a0": { + "compensate": 1.021, + "mdx_dim_f_set": 3072, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 7680, + "primary_stem": "Vocals" + }, + "0f2a6bc5b49d87d64728ee40e23bceb1": { + "compensate": 1.019, + "mdx_dim_f_set": 2560, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "Instrumental" + }, + "b02be2d198d4968a121030cf8950b492": { + "compensate": 1.020, + "mdx_dim_f_set": 2560, + "mdx_dim_t_set": 8, + "mdx_n_fft_scale_set": 5120, + "primary_stem": "No Crowd" + }, + "2154254ee89b2945b97a7efed6e88820": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "063aadd735d58150722926dcbf5852a9": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "fe96801369f6a148df2720f5ced88c19": { + "config_yaml": "model3.yaml" + }, + "02e8b226f85fb566e5db894b9931c640": { + "config_yaml": "model2.yaml" + }, + "e3de6d861635ab9c1d766149edd680d6": { + "config_yaml": "model1.yaml" + }, + "3f2936c554ab73ce2e396d54636bd373": { + "config_yaml": "modelB.yaml" + }, + "890d0f6f82d7574bca741a9e8bcb8168": { + "config_yaml": "modelB.yaml" + }, + "63a3cb8c37c474681049be4ad1ba8815": { + "config_yaml": "modelB.yaml" + }, + "a7fc5d719743c7fd6b61bd2b4d48b9f0": { + "config_yaml": "modelA.yaml" + }, + "3567f3dee6e77bf366fcb1c7b8bc3745": { + "config_yaml": "modelA.yaml" + }, + "a28f4d717bd0d34cd2ff7a3b0a3d065e": { + "config_yaml": "modelA.yaml" + }, + "c9971a18da20911822593dc81caa8be9": { + "config_yaml": "sndfx.yaml" + }, + "57d94d5ed705460d21c75a5ac829a605": { + "config_yaml": "sndfx.yaml" + }, + "e7a25f8764f25a52c1b96c4946e66ba2": { + "config_yaml": "sndfx.yaml" + }, + "104081d24e37217086ce5fde09147ee1": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "1e6165b601539f38d0a9330f3facffeb": { + "config_yaml": "model_2_stem_061321.yaml" + }, + "fe0108464ce0d8271be5ab810891bd7c": { + "config_yaml": "model_2_stem_full_band.yaml" + } +} \ No newline at end of file diff --git a/packages.txt b/packages.txt new file mode 100644 index 
0000000000000000000000000000000000000000..5ea7c73c110e77879f55380ebd7d49e2bf2f2430 --- /dev/null +++ b/packages.txt @@ -0,0 +1,3 @@ +git-lfs +aria2 -y +ffmpeg \ No newline at end of file diff --git a/pre-requirements.txt b/pre-requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..8af378824a0ef69080263b260075e313793f8c22 --- /dev/null +++ b/pre-requirements.txt @@ -0,0 +1,15 @@ +--extra-index-url https://download.pytorch.org/whl/cu118 +torch>=2.1.0+cu118 +torchvision>=0.16.0+cu118 +torchaudio>=2.1.0+cu118 +yt-dlp +gradio==4.19.2 +pydub==0.25.1 +edge_tts==6.1.7 +deep_translator==1.11.4 +git+https://github.com/R3gm/pyannote-audio.git@3.1.1 +git+https://github.com/R3gm/whisperX.git@cuda_11_8 +nest_asyncio +gTTS +gradio_client==0.10.1 +IPython diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..058fb3f0c740a359b90d7e7586ee990f840badbf --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +praat-parselmouth>=0.4.3 +pyworld==0.3.2 +faiss-cpu==1.7.3 +torchcrepe==0.0.20 +ffmpeg-python>=0.2.0 +fairseq==0.12.2 +gdown +rarfile +transformers +accelerate +optimum +sentencepiece +srt +git+https://github.com/R3gm/openvoice_package.git@lite +openai==1.14.3 +tiktoken==0.6.0 +# Documents +pypdf==4.2.0 +python-docx \ No newline at end of file diff --git a/requirements_xtts.txt b/requirements_xtts.txt new file mode 100644 index 0000000000000000000000000000000000000000..888bd860222e1971640b165a401b71cfbbae45e0 --- /dev/null +++ b/requirements_xtts.txt @@ -0,0 +1,58 @@ +# core deps +numpy==1.23.5 +cython>=0.29.30 +scipy>=1.11.2 +torch +torchaudio +soundfile +librosa +scikit-learn +numba +inflect>=5.6.0 +tqdm>=4.64.1 +anyascii>=0.3.0 +pyyaml>=6.0 +fsspec>=2023.6.0 # <= 2023.9.1 makes aux tests fail +aiohttp>=3.8.1 +packaging>=23.1 +# deps for examples +flask>=2.0.1 +# deps for inference +pysbd>=0.3.4 +# deps for notebooks +umap-learn>=0.5.1 +pandas +# deps for training +matplotlib +# coqui stack +trainer>=0.0.32 +# config management +coqpit>=0.0.16 +# chinese g2p deps +jieba +pypinyin +# korean +hangul_romanize +# gruut+supported langs +gruut[de,es,fr]==2.2.3 +# deps for korean +jamo +nltk +g2pkk>=0.1.1 +# deps for bangla +bangla +bnnumerizer +bnunicodenormalizer +#deps for tortoise +einops>=0.6.0 +transformers +#deps for bark +encodec>=0.1.1 +# deps for XTTS +unidecode>=1.3.2 +num2words +spacy[ja]>=3 + +# after this +# pip install -r requirements_xtts.txt +# pip install TTS==0.21.1 --no-deps \ No newline at end of file diff --git a/soni_translate/audio_segments.py b/soni_translate/audio_segments.py new file mode 100644 index 0000000000000000000000000000000000000000..105c6ba2dabd73cc65864724c861f9a034f983a3 --- /dev/null +++ b/soni_translate/audio_segments.py @@ -0,0 +1,141 @@ +from pydub import AudioSegment +from tqdm import tqdm +from .utils import run_command +from .logging_setup import logger +import numpy as np + + +class Mixer: + def __init__(self): + self.parts = [] + + def __len__(self): + parts = self._sync() + seg = parts[0][1] + frame_count = max(offset + seg.frame_count() for offset, seg in parts) + return int(1000.0 * frame_count / seg.frame_rate) + + def overlay(self, sound, position=0): + self.parts.append((position, sound)) + return self + + def _sync(self): + positions, segs = zip(*self.parts) + + frame_rate = segs[0].frame_rate + array_type = segs[0].array_type # noqa + + offsets = [int(frame_rate * pos / 1000.0) for pos in positions] + segs = AudioSegment.empty()._sync(*segs) + return 
list(zip(offsets, segs)) + + def append(self, sound): + self.overlay(sound, position=len(self)) + + def to_audio_segment(self): + parts = self._sync() + seg = parts[0][1] + channels = seg.channels + + frame_count = max(offset + seg.frame_count() for offset, seg in parts) + sample_count = int(frame_count * seg.channels) + + output = np.zeros(sample_count, dtype="int32") + for offset, seg in parts: + sample_offset = offset * channels + samples = np.frombuffer(seg.get_array_of_samples(), dtype="int32") + samples = np.int16(samples/np.max(np.abs(samples)) * 32767) + start = sample_offset + end = start + len(samples) + output[start:end] += samples + + return seg._spawn( + output, overrides={"sample_width": 4}).normalize(headroom=0.0) + + +def create_translated_audio( + result_diarize, audio_files, final_file, concat=False, avoid_overlap=False, +): + total_duration = result_diarize["segments"][-1]["end"] # in seconds + + if concat: + """ + file .\audio\1.ogg + file .\audio\2.ogg + file .\audio\3.ogg + file .\audio\4.ogg + ... + """ + + # Write the file paths to list.txt + with open("list.txt", "w") as file: + for i, audio_file in enumerate(audio_files): + if i == len(audio_files) - 1: # Check if it's the last item + file.write(f"file {audio_file}") + else: + file.write(f"file {audio_file}\n") + + # command = f"ffmpeg -f concat -safe 0 -i list.txt {final_file}" + command = ( + f"ffmpeg -f concat -safe 0 -i list.txt -c:a pcm_s16le {final_file}" + ) + run_command(command) + + else: + # silent audio with total_duration + base_audio = AudioSegment.silent( + duration=int(total_duration * 1000), frame_rate=41000 + ) + combined_audio = Mixer() + combined_audio.overlay(base_audio) + + logger.debug( + f"Audio duration: {total_duration // 60} " + f"minutes and {int(total_duration % 60)} seconds" + ) + + last_end_time = 0 + previous_speaker = "" + for line, audio_file in tqdm( + zip(result_diarize["segments"], audio_files) + ): + start = float(line["start"]) + + # Overlay each audio at the corresponding time + try: + audio = AudioSegment.from_file(audio_file) + # audio_a = audio.speedup(playback_speed=1.5) + + if avoid_overlap: + speaker = line["speaker"] + if (last_end_time - 0.500) > start: + overlap_time = last_end_time - start + if previous_speaker and previous_speaker != speaker: + start = (last_end_time - 0.500) + else: + start = (last_end_time - 0.200) + if overlap_time > 2.5: + start = start - 0.3 + logger.info( + f"Avoid overlap for {str(audio_file)} " + f"with {str(start)}" + ) + + previous_speaker = speaker + + duration_tts_seconds = len(audio) / 1000.0 # to sec + last_end_time = (start + duration_tts_seconds) + + start_time = start * 1000 # to ms + combined_audio = combined_audio.overlay( + audio, position=start_time + ) + except Exception as error: + logger.debug(str(error)) + logger.error(f"Error audio file {audio_file}") + + # combined audio as a file + combined_audio_data = combined_audio.to_audio_segment() + combined_audio_data.export( + final_file, format="wav" + ) # best than ogg, change if the audio is anomalous diff --git a/soni_translate/language_configuration.py b/soni_translate/language_configuration.py new file mode 100644 index 0000000000000000000000000000000000000000..e697e2861caac2fe0358acca6ad86bf00abc51f2 --- /dev/null +++ b/soni_translate/language_configuration.py @@ -0,0 +1,551 @@ +from .logging_setup import logger + +LANGUAGES_UNIDIRECTIONAL = { + "Aymara (ay)": "ay", + "Bambara (bm)": "bm", + "Cebuano (ceb)": "ceb", + "Chichewa (ny)": "ny", + "Divehi (dv)": "dv", + "Dogri 
(doi)": "doi", + "Ewe (ee)": "ee", + "Guarani (gn)": "gn", + "Iloko (ilo)": "ilo", + "Kinyarwanda (rw)": "rw", + "Krio (kri)": "kri", + "Kurdish (ku)": "ku", + "Kirghiz (ky)": "ky", + "Ganda (lg)": "lg", + "Maithili (mai)": "mai", + "Oriya (or)": "or", + "Oromo (om)": "om", + "Quechua (qu)": "qu", + "Samoan (sm)": "sm", + "Tigrinya (ti)": "ti", + "Tsonga (ts)": "ts", + "Akan (ak)": "ak", + "Uighur (ug)": "ug" +} + +UNIDIRECTIONAL_L_LIST = LANGUAGES_UNIDIRECTIONAL.keys() + +LANGUAGES = { + "Automatic detection": "Automatic detection", + "Arabic (ar)": "ar", + "Chinese - Simplified (zh-CN)": "zh", + "Czech (cs)": "cs", + "Danish (da)": "da", + "Dutch (nl)": "nl", + "English (en)": "en", + "Finnish (fi)": "fi", + "French (fr)": "fr", + "German (de)": "de", + "Greek (el)": "el", + "Hebrew (he)": "he", + "Hungarian (hu)": "hu", + "Italian (it)": "it", + "Japanese (ja)": "ja", + "Korean (ko)": "ko", + "Persian (fa)": "fa", # no aux gTTS + "Polish (pl)": "pl", + "Portuguese (pt)": "pt", + "Russian (ru)": "ru", + "Spanish (es)": "es", + "Turkish (tr)": "tr", + "Ukrainian (uk)": "uk", + "Urdu (ur)": "ur", + "Vietnamese (vi)": "vi", + "Hindi (hi)": "hi", + "Indonesian (id)": "id", + "Bengali (bn)": "bn", + "Telugu (te)": "te", + "Marathi (mr)": "mr", + "Tamil (ta)": "ta", + "Javanese (jw|jv)": "jw", + "Catalan (ca)": "ca", + "Nepali (ne)": "ne", + "Thai (th)": "th", + "Swedish (sv)": "sv", + "Amharic (am)": "am", + "Welsh (cy)": "cy", # no aux gTTS + "Estonian (et)": "et", + "Croatian (hr)": "hr", + "Icelandic (is)": "is", + "Georgian (ka)": "ka", # no aux gTTS + "Khmer (km)": "km", + "Slovak (sk)": "sk", + "Albanian (sq)": "sq", + "Serbian (sr)": "sr", + "Azerbaijani (az)": "az", # no aux gTTS + "Bulgarian (bg)": "bg", + "Galician (gl)": "gl", # no aux gTTS + "Gujarati (gu)": "gu", + "Kazakh (kk)": "kk", # no aux gTTS + "Kannada (kn)": "kn", + "Lithuanian (lt)": "lt", # no aux gTTS + "Latvian (lv)": "lv", + "Macedonian (mk)": "mk", # no aux gTTS # error get align model + "Malayalam (ml)": "ml", + "Malay (ms)": "ms", # error get align model + "Romanian (ro)": "ro", + "Sinhala (si)": "si", + "Sundanese (su)": "su", + "Swahili (sw)": "sw", # error aling + "Afrikaans (af)": "af", + "Bosnian (bs)": "bs", + "Latin (la)": "la", + "Myanmar Burmese (my)": "my", + "Norwegian (no|nb)": "no", + "Chinese - Traditional (zh-TW)": "zh-TW", + "Assamese (as)": "as", + "Basque (eu)": "eu", + "Hausa (ha)": "ha", + "Haitian Creole (ht)": "ht", + "Armenian (hy)": "hy", + "Lao (lo)": "lo", + "Malagasy (mg)": "mg", + "Mongolian (mn)": "mn", + "Maltese (mt)": "mt", + "Punjabi (pa)": "pa", + "Pashto (ps)": "ps", + "Slovenian (sl)": "sl", + "Shona (sn)": "sn", + "Somali (so)": "so", + "Tajik (tg)": "tg", + "Turkmen (tk)": "tk", + "Tatar (tt)": "tt", + "Uzbek (uz)": "uz", + "Yoruba (yo)": "yo", + **LANGUAGES_UNIDIRECTIONAL +} + +BASE_L_LIST = LANGUAGES.keys() +LANGUAGES_LIST = [list(BASE_L_LIST)[0]] + sorted(list(BASE_L_LIST)[1:]) +INVERTED_LANGUAGES = {value: key for key, value in LANGUAGES.items()} + +EXTRA_ALIGN = { + "id": "indonesian-nlp/wav2vec2-large-xlsr-indonesian", + "bn": "arijitx/wav2vec2-large-xlsr-bengali", + "mr": "sumedh/wav2vec2-large-xlsr-marathi", + "ta": "Amrrs/wav2vec2-large-xlsr-53-tamil", + "jw": "cahya/wav2vec2-large-xlsr-javanese", + "ne": "shniranjan/wav2vec2-large-xlsr-300m-nepali", + "th": "sakares/wav2vec2-large-xlsr-thai-demo", + "sv": "KBLab/wav2vec2-large-voxrex-swedish", + "am": "agkphysics/wav2vec2-large-xlsr-53-amharic", + "cy": "Srulikbdd/Wav2Vec2-large-xlsr-welsh", + "et": 
"anton-l/wav2vec2-large-xlsr-53-estonian", + "hr": "classla/wav2vec2-xls-r-parlaspeech-hr", + "is": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-icelandic-ep10-1000h", + "ka": "MehdiHosseiniMoghadam/wav2vec2-large-xlsr-53-Georgian", + "km": "vitouphy/wav2vec2-xls-r-300m-khmer", + "sk": "infinitejoy/wav2vec2-large-xls-r-300m-slovak", + "sq": "Alimzhan/wav2vec2-large-xls-r-300m-albanian-colab", + "sr": "dnikolic/wav2vec2-xlsr-530-serbian-colab", + "az": "nijatzeynalov/wav2vec2-large-mms-1b-azerbaijani-common_voice15.0", + "bg": "infinitejoy/wav2vec2-large-xls-r-300m-bulgarian", + "gl": "ifrz/wav2vec2-large-xlsr-galician", + "gu": "Harveenchadha/vakyansh-wav2vec2-gujarati-gnm-100", + "kk": "aismlv/wav2vec2-large-xlsr-kazakh", + "kn": "Harveenchadha/vakyansh-wav2vec2-kannada-knm-560", + "lt": "DeividasM/wav2vec2-large-xlsr-53-lithuanian", + "lv": "anton-l/wav2vec2-large-xlsr-53-latvian", + "mk": "", # Konstantin-Bogdanoski/wav2vec2-macedonian-base + "ml": "gvs/wav2vec2-large-xlsr-malayalam", + "ms": "", # Duy/wav2vec2_malay + "ro": "anton-l/wav2vec2-large-xlsr-53-romanian", + "si": "IAmNotAnanth/wav2vec2-large-xls-r-300m-sinhala", + "su": "cahya/wav2vec2-large-xlsr-sundanese", + "sw": "", # Lians/fine-tune-wav2vec2-large-swahili + "af": "", # ylacombe/wav2vec2-common_voice-af-demo + "bs": "", + "la": "", + "my": "", + "no": "NbAiLab/wav2vec2-xlsr-300m-norwegian", + "zh-TW": "jonatasgrosman/wav2vec2-large-xlsr-53-chinese-zh-cn", + "as": "", + "eu": "", # cahya/wav2vec2-large-xlsr-basque # verify + "ha": "infinitejoy/wav2vec2-large-xls-r-300m-hausa", + "ht": "", + "hy": "infinitejoy/wav2vec2-large-xls-r-300m-armenian", # no (.) + "lo": "", + "mg": "", + "mn": "tugstugi/wav2vec2-large-xlsr-53-mongolian", + "mt": "carlosdanielhernandezmena/wav2vec2-large-xlsr-53-maltese-64h", + "pa": "kingabzpro/wav2vec2-large-xlsr-53-punjabi", + "ps": "aamirhs/wav2vec2-large-xls-r-300m-pashto-colab", + "sl": "anton-l/wav2vec2-large-xlsr-53-slovenian", + "sn": "", + "so": "", + "tg": "", + "tk": "", # Ragav/wav2vec2-tk + "tt": "anton-l/wav2vec2-large-xlsr-53-tatar", + "uz": "", # Mekhriddin/wav2vec2-large-xls-r-300m-uzbek-colab + "yo": "ogbi/wav2vec2-large-mms-1b-yoruba-test", +} + + +def fix_code_language(translate_to, syntax="google"): + if syntax == "google": + # google-translator, gTTS + replace_lang_code = {"zh": "zh-CN", "he": "iw", "zh-cn": "zh-CN"} + elif syntax == "coqui": + # coqui-xtts + replace_lang_code = {"zh": "zh-cn", "zh-CN": "zh-cn", "zh-TW": "zh-cn"} + + new_code_lang = replace_lang_code.get(translate_to, translate_to) + logger.debug(f"Fix code {translate_to} -> {new_code_lang}") + return new_code_lang + + +BARK_VOICES_LIST = { + "de_speaker_0-Male BARK": "v2/de_speaker_0", + "de_speaker_1-Male BARK": "v2/de_speaker_1", + "de_speaker_2-Male BARK": "v2/de_speaker_2", + "de_speaker_3-Female BARK": "v2/de_speaker_3", + "de_speaker_4-Male BARK": "v2/de_speaker_4", + "de_speaker_5-Male BARK": "v2/de_speaker_5", + "de_speaker_6-Male BARK": "v2/de_speaker_6", + "de_speaker_7-Male BARK": "v2/de_speaker_7", + "de_speaker_8-Female BARK": "v2/de_speaker_8", + "de_speaker_9-Male BARK": "v2/de_speaker_9", + "en_speaker_0-Male BARK": "v2/en_speaker_0", + "en_speaker_1-Male BARK": "v2/en_speaker_1", + "en_speaker_2-Male BARK": "v2/en_speaker_2", + "en_speaker_3-Male BARK": "v2/en_speaker_3", + "en_speaker_4-Male BARK": "v2/en_speaker_4", + "en_speaker_5-Male BARK": "v2/en_speaker_5", + "en_speaker_6-Male BARK": "v2/en_speaker_6", + "en_speaker_7-Male BARK": "v2/en_speaker_7", + "en_speaker_8-Male BARK": 
"v2/en_speaker_8", + "en_speaker_9-Female BARK": "v2/en_speaker_9", + "es_speaker_0-Male BARK": "v2/es_speaker_0", + "es_speaker_1-Male BARK": "v2/es_speaker_1", + "es_speaker_2-Male BARK": "v2/es_speaker_2", + "es_speaker_3-Male BARK": "v2/es_speaker_3", + "es_speaker_4-Male BARK": "v2/es_speaker_4", + "es_speaker_5-Male BARK": "v2/es_speaker_5", + "es_speaker_6-Male BARK": "v2/es_speaker_6", + "es_speaker_7-Male BARK": "v2/es_speaker_7", + "es_speaker_8-Female BARK": "v2/es_speaker_8", + "es_speaker_9-Female BARK": "v2/es_speaker_9", + "fr_speaker_0-Male BARK": "v2/fr_speaker_0", + "fr_speaker_1-Female BARK": "v2/fr_speaker_1", + "fr_speaker_2-Female BARK": "v2/fr_speaker_2", + "fr_speaker_3-Male BARK": "v2/fr_speaker_3", + "fr_speaker_4-Male BARK": "v2/fr_speaker_4", + "fr_speaker_5-Female BARK": "v2/fr_speaker_5", + "fr_speaker_6-Male BARK": "v2/fr_speaker_6", + "fr_speaker_7-Male BARK": "v2/fr_speaker_7", + "fr_speaker_8-Male BARK": "v2/fr_speaker_8", + "fr_speaker_9-Male BARK": "v2/fr_speaker_9", + "hi_speaker_0-Female BARK": "v2/hi_speaker_0", + "hi_speaker_1-Female BARK": "v2/hi_speaker_1", + "hi_speaker_2-Male BARK": "v2/hi_speaker_2", + "hi_speaker_3-Female BARK": "v2/hi_speaker_3", + "hi_speaker_4-Female BARK": "v2/hi_speaker_4", + "hi_speaker_5-Male BARK": "v2/hi_speaker_5", + "hi_speaker_6-Male BARK": "v2/hi_speaker_6", + "hi_speaker_7-Male BARK": "v2/hi_speaker_7", + "hi_speaker_8-Male BARK": "v2/hi_speaker_8", + "hi_speaker_9-Female BARK": "v2/hi_speaker_9", + "it_speaker_0-Male BARK": "v2/it_speaker_0", + "it_speaker_1-Male BARK": "v2/it_speaker_1", + "it_speaker_2-Female BARK": "v2/it_speaker_2", + "it_speaker_3-Male BARK": "v2/it_speaker_3", + "it_speaker_4-Male BARK": "v2/it_speaker_4", + "it_speaker_5-Male BARK": "v2/it_speaker_5", + "it_speaker_6-Male BARK": "v2/it_speaker_6", + "it_speaker_7-Female BARK": "v2/it_speaker_7", + "it_speaker_8-Male BARK": "v2/it_speaker_8", + "it_speaker_9-Female BARK": "v2/it_speaker_9", + "ja_speaker_0-Female BARK": "v2/ja_speaker_0", + "ja_speaker_1-Female BARK": "v2/ja_speaker_1", + "ja_speaker_2-Male BARK": "v2/ja_speaker_2", + "ja_speaker_3-Female BARK": "v2/ja_speaker_3", + "ja_speaker_4-Female BARK": "v2/ja_speaker_4", + "ja_speaker_5-Female BARK": "v2/ja_speaker_5", + "ja_speaker_6-Male BARK": "v2/ja_speaker_6", + "ja_speaker_7-Female BARK": "v2/ja_speaker_7", + "ja_speaker_8-Female BARK": "v2/ja_speaker_8", + "ja_speaker_9-Female BARK": "v2/ja_speaker_9", + "ko_speaker_0-Female BARK": "v2/ko_speaker_0", + "ko_speaker_1-Male BARK": "v2/ko_speaker_1", + "ko_speaker_2-Male BARK": "v2/ko_speaker_2", + "ko_speaker_3-Male BARK": "v2/ko_speaker_3", + "ko_speaker_4-Male BARK": "v2/ko_speaker_4", + "ko_speaker_5-Male BARK": "v2/ko_speaker_5", + "ko_speaker_6-Male BARK": "v2/ko_speaker_6", + "ko_speaker_7-Male BARK": "v2/ko_speaker_7", + "ko_speaker_8-Male BARK": "v2/ko_speaker_8", + "ko_speaker_9-Male BARK": "v2/ko_speaker_9", + "pl_speaker_0-Male BARK": "v2/pl_speaker_0", + "pl_speaker_1-Male BARK": "v2/pl_speaker_1", + "pl_speaker_2-Male BARK": "v2/pl_speaker_2", + "pl_speaker_3-Male BARK": "v2/pl_speaker_3", + "pl_speaker_4-Female BARK": "v2/pl_speaker_4", + "pl_speaker_5-Male BARK": "v2/pl_speaker_5", + "pl_speaker_6-Female BARK": "v2/pl_speaker_6", + "pl_speaker_7-Male BARK": "v2/pl_speaker_7", + "pl_speaker_8-Male BARK": "v2/pl_speaker_8", + "pl_speaker_9-Female BARK": "v2/pl_speaker_9", + "pt_speaker_0-Male BARK": "v2/pt_speaker_0", + "pt_speaker_1-Male BARK": "v2/pt_speaker_1", + "pt_speaker_2-Male BARK": "v2/pt_speaker_2", + 
"pt_speaker_3-Male BARK": "v2/pt_speaker_3", + "pt_speaker_4-Male BARK": "v2/pt_speaker_4", + "pt_speaker_5-Male BARK": "v2/pt_speaker_5", + "pt_speaker_6-Male BARK": "v2/pt_speaker_6", + "pt_speaker_7-Male BARK": "v2/pt_speaker_7", + "pt_speaker_8-Male BARK": "v2/pt_speaker_8", + "pt_speaker_9-Male BARK": "v2/pt_speaker_9", + "ru_speaker_0-Male BARK": "v2/ru_speaker_0", + "ru_speaker_1-Male BARK": "v2/ru_speaker_1", + "ru_speaker_2-Male BARK": "v2/ru_speaker_2", + "ru_speaker_3-Male BARK": "v2/ru_speaker_3", + "ru_speaker_4-Male BARK": "v2/ru_speaker_4", + "ru_speaker_5-Female BARK": "v2/ru_speaker_5", + "ru_speaker_6-Female BARK": "v2/ru_speaker_6", + "ru_speaker_7-Male BARK": "v2/ru_speaker_7", + "ru_speaker_8-Male BARK": "v2/ru_speaker_8", + "ru_speaker_9-Female BARK": "v2/ru_speaker_9", + "tr_speaker_0-Male BARK": "v2/tr_speaker_0", + "tr_speaker_1-Male BARK": "v2/tr_speaker_1", + "tr_speaker_2-Male BARK": "v2/tr_speaker_2", + "tr_speaker_3-Male BARK": "v2/tr_speaker_3", + "tr_speaker_4-Female BARK": "v2/tr_speaker_4", + "tr_speaker_5-Female BARK": "v2/tr_speaker_5", + "tr_speaker_6-Male BARK": "v2/tr_speaker_6", + "tr_speaker_7-Male BARK": "v2/tr_speaker_7", + "tr_speaker_8-Male BARK": "v2/tr_speaker_8", + "tr_speaker_9-Male BARK": "v2/tr_speaker_9", + "zh_speaker_0-Male BARK": "v2/zh_speaker_0", + "zh_speaker_1-Male BARK": "v2/zh_speaker_1", + "zh_speaker_2-Male BARK": "v2/zh_speaker_2", + "zh_speaker_3-Male BARK": "v2/zh_speaker_3", + "zh_speaker_4-Female BARK": "v2/zh_speaker_4", + "zh_speaker_5-Male BARK": "v2/zh_speaker_5", + "zh_speaker_6-Female BARK": "v2/zh_speaker_6", + "zh_speaker_7-Female BARK": "v2/zh_speaker_7", + "zh_speaker_8-Male BARK": "v2/zh_speaker_8", + "zh_speaker_9-Female BARK": "v2/zh_speaker_9", +} + +VITS_VOICES_LIST = { + "ar-facebook-mms VITS": "facebook/mms-tts-ara", + # 'zh-facebook-mms VITS': 'facebook/mms-tts-cmn', + "zh_Hakka-facebook-mms VITS": "facebook/mms-tts-hak", + "zh_MinNan-facebook-mms VITS": "facebook/mms-tts-nan", + # 'cs-facebook-mms VITS': 'facebook/mms-tts-ces', + # 'da-facebook-mms VITS': 'facebook/mms-tts-dan', + "nl-facebook-mms VITS": "facebook/mms-tts-nld", + "en-facebook-mms VITS": "facebook/mms-tts-eng", + "fi-facebook-mms VITS": "facebook/mms-tts-fin", + "fr-facebook-mms VITS": "facebook/mms-tts-fra", + "de-facebook-mms VITS": "facebook/mms-tts-deu", + "el-facebook-mms VITS": "facebook/mms-tts-ell", + "el_Ancient-facebook-mms VITS": "facebook/mms-tts-grc", + "he-facebook-mms VITS": "facebook/mms-tts-heb", + "hu-facebook-mms VITS": "facebook/mms-tts-hun", + # 'it-facebook-mms VITS': 'facebook/mms-tts-ita', + # 'ja-facebook-mms VITS': 'facebook/mms-tts-jpn', + "ko-facebook-mms VITS": "facebook/mms-tts-kor", + "fa-facebook-mms VITS": "facebook/mms-tts-fas", + "pl-facebook-mms VITS": "facebook/mms-tts-pol", + "pt-facebook-mms VITS": "facebook/mms-tts-por", + "ru-facebook-mms VITS": "facebook/mms-tts-rus", + "es-facebook-mms VITS": "facebook/mms-tts-spa", + "tr-facebook-mms VITS": "facebook/mms-tts-tur", + "uk-facebook-mms VITS": "facebook/mms-tts-ukr", + "ur_arabic-facebook-mms VITS": "facebook/mms-tts-urd-script_arabic", + "ur_devanagari-facebook-mms VITS": "facebook/mms-tts-urd-script_devanagari", + "ur_latin-facebook-mms VITS": "facebook/mms-tts-urd-script_latin", + "vi-facebook-mms VITS": "facebook/mms-tts-vie", + "hi-facebook-mms VITS": "facebook/mms-tts-hin", + "hi_Fiji-facebook-mms VITS": "facebook/mms-tts-hif", + "id-facebook-mms VITS": "facebook/mms-tts-ind", + "bn-facebook-mms VITS": "facebook/mms-tts-ben", + 
"te-facebook-mms VITS": "facebook/mms-tts-tel", + "mr-facebook-mms VITS": "facebook/mms-tts-mar", + "ta-facebook-mms VITS": "facebook/mms-tts-tam", + "jw-facebook-mms VITS": "facebook/mms-tts-jav", + "jw_Suriname-facebook-mms VITS": "facebook/mms-tts-jvn", + "ca-facebook-mms VITS": "facebook/mms-tts-cat", + "ne-facebook-mms VITS": "facebook/mms-tts-nep", + "th-facebook-mms VITS": "facebook/mms-tts-tha", + "th_Northern-facebook-mms VITS": "facebook/mms-tts-nod", + "sv-facebook-mms VITS": "facebook/mms-tts-swe", + "am-facebook-mms VITS": "facebook/mms-tts-amh", + "cy-facebook-mms VITS": "facebook/mms-tts-cym", + # "et-facebook-mms VITS": "facebook/mms-tts-est", + # "ht-facebook-mms VITS": "facebook/mms-tts-hrv", + "is-facebook-mms VITS": "facebook/mms-tts-isl", + "km-facebook-mms VITS": "facebook/mms-tts-khm", + "km_Northern-facebook-mms VITS": "facebook/mms-tts-kxm", + # "sk-facebook-mms VITS": "facebook/mms-tts-slk", + "sq_Northern-facebook-mms VITS": "facebook/mms-tts-sqi", + "az_South-facebook-mms VITS": "facebook/mms-tts-azb", + "az_North_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-azj-script_cyrillic", + "az_North_script_latin-facebook-mms VITS": "facebook/mms-tts-azj-script_latin", + "bg-facebook-mms VITS": "facebook/mms-tts-bul", + # "gl-facebook-mms VITS": "facebook/mms-tts-glg", + "gu-facebook-mms VITS": "facebook/mms-tts-guj", + "kk-facebook-mms VITS": "facebook/mms-tts-kaz", + "kn-facebook-mms VITS": "facebook/mms-tts-kan", + # "lt-facebook-mms VITS": "facebook/mms-tts-lit", + "lv-facebook-mms VITS": "facebook/mms-tts-lav", + # "mk-facebook-mms VITS": "facebook/mms-tts-mkd", + "ml-facebook-mms VITS": "facebook/mms-tts-mal", + "ms-facebook-mms VITS": "facebook/mms-tts-zlm", + "ms_Central-facebook-mms VITS": "facebook/mms-tts-pse", + "ms_Manado-facebook-mms VITS": "facebook/mms-tts-xmm", + "ro-facebook-mms VITS": "facebook/mms-tts-ron", + # "si-facebook-mms VITS": "facebook/mms-tts-sin", + "sw-facebook-mms VITS": "facebook/mms-tts-swh", + # "af-facebook-mms VITS": "facebook/mms-tts-afr", + # "bs-facebook-mms VITS": "facebook/mms-tts-bos", + "la-facebook-mms VITS": "facebook/mms-tts-lat", + "my-facebook-mms VITS": "facebook/mms-tts-mya", + # "no_Bokmål-facebook-mms VITS": "thomasht86/mms-tts-nob", # verify + "as-facebook-mms VITS": "facebook/mms-tts-asm", + "as_Nagamese-facebook-mms VITS": "facebook/mms-tts-nag", + "eu-facebook-mms VITS": "facebook/mms-tts-eus", + "ha-facebook-mms VITS": "facebook/mms-tts-hau", + "ht-facebook-mms VITS": "facebook/mms-tts-hat", + "hy_Western-facebook-mms VITS": "facebook/mms-tts-hyw", + "lo-facebook-mms VITS": "facebook/mms-tts-lao", + "mg-facebook-mms VITS": "facebook/mms-tts-mlg", + "mn-facebook-mms VITS": "facebook/mms-tts-mon", + # "mt-facebook-mms VITS": "facebook/mms-tts-mlt", + "pa_Eastern-facebook-mms VITS": "facebook/mms-tts-pan", + # "pa_Western-facebook-mms VITS": "facebook/mms-tts-pnb", + # "ps-facebook-mms VITS": "facebook/mms-tts-pus", + # "sl-facebook-mms VITS": "facebook/mms-tts-slv", + "sn-facebook-mms VITS": "facebook/mms-tts-sna", + "so-facebook-mms VITS": "facebook/mms-tts-son", + "tg-facebook-mms VITS": "facebook/mms-tts-tgk", + "tk_script_arabic-facebook-mms VITS": "facebook/mms-tts-tuk-script_arabic", + "tk_script_latin-facebook-mms VITS": "facebook/mms-tts-tuk-script_latin", + "tt-facebook-mms VITS": "facebook/mms-tts-tat", + "tt_Crimean-facebook-mms VITS": "facebook/mms-tts-crh", + "uz_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uzb-script_cyrillic", + "yo-facebook-mms VITS": "facebook/mms-tts-yor", + 
"ay-facebook-mms VITS": "facebook/mms-tts-ayr", + "bm-facebook-mms VITS": "facebook/mms-tts-bam", + "ceb-facebook-mms VITS": "facebook/mms-tts-ceb", + "ny-facebook-mms VITS": "facebook/mms-tts-nya", + "dv-facebook-mms VITS": "facebook/mms-tts-div", + "doi-facebook-mms VITS": "facebook/mms-tts-dgo", + "ee-facebook-mms VITS": "facebook/mms-tts-ewe", + "gn-facebook-mms VITS": "facebook/mms-tts-grn", + "ilo-facebook-mms VITS": "facebook/mms-tts-ilo", + "rw-facebook-mms VITS": "facebook/mms-tts-kin", + "kri-facebook-mms VITS": "facebook/mms-tts-kri", + "ku_script_arabic-facebook-mms VITS": "facebook/mms-tts-kmr-script_arabic", + "ku_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-kmr-script_cyrillic", + "ku_script_latin-facebook-mms VITS": "facebook/mms-tts-kmr-script_latin", + "ckb-facebook-mms VITS": "razhan/mms-tts-ckb", # Verify w + "ky-facebook-mms VITS": "facebook/mms-tts-kir", + "lg-facebook-mms VITS": "facebook/mms-tts-lug", + "mai-facebook-mms VITS": "facebook/mms-tts-mai", + "or-facebook-mms VITS": "facebook/mms-tts-ory", + "om-facebook-mms VITS": "facebook/mms-tts-orm", + "qu_Huallaga-facebook-mms VITS": "facebook/mms-tts-qub", + "qu_Lambayeque-facebook-mms VITS": "facebook/mms-tts-quf", + "qu_South_Bolivian-facebook-mms VITS": "facebook/mms-tts-quh", + "qu_North_Bolivian-facebook-mms VITS": "facebook/mms-tts-qul", + "qu_Tena_Lowland-facebook-mms VITS": "facebook/mms-tts-quw", + "qu_Ayacucho-facebook-mms VITS": "facebook/mms-tts-quy", + "qu_Cusco-facebook-mms VITS": "facebook/mms-tts-quz", + "qu_Cajamarca-facebook-mms VITS": "facebook/mms-tts-qvc", + "qu_Eastern_Apurímac-facebook-mms VITS": "facebook/mms-tts-qve", + "qu_Huamalíes_Dos_de_Mayo_Huánuco-facebook-mms VITS": "facebook/mms-tts-qvh", + "qu_Margos_Yarowilca_Lauricocha-facebook-mms VITS": "facebook/mms-tts-qvm", + "qu_North_Junín-facebook-mms VITS": "facebook/mms-tts-qvn", + "qu_Napo-facebook-mms VITS": "facebook/mms-tts-qvo", + "qu_San_Martín-facebook-mms VITS": "facebook/mms-tts-qvs", + "qu_Huaylla_Wanca-facebook-mms VITS": "facebook/mms-tts-qvw", + "qu_Northern_Pastaza-facebook-mms VITS": "facebook/mms-tts-qvz", + "qu_Huaylas_Ancash-facebook-mms VITS": "facebook/mms-tts-qwh", + "qu_Panao-facebook-mms VITS": "facebook/mms-tts-qxh", + "qu_Salasaca_Highland-facebook-mms VITS": "facebook/mms-tts-qxl", + "qu_Northern_Conchucos_Ancash-facebook-mms VITS": "facebook/mms-tts-qxn", + "qu_Southern_Conchucos-facebook-mms VITS": "facebook/mms-tts-qxo", + "qu_Cañar_Highland-facebook-mms VITS": "facebook/mms-tts-qxr", + "sm-facebook-mms VITS": "facebook/mms-tts-smo", + "ti-facebook-mms VITS": "facebook/mms-tts-tir", + "ts-facebook-mms VITS": "facebook/mms-tts-tso", + "ak-facebook-mms VITS": "facebook/mms-tts-aka", + "ug_script_arabic-facebook-mms VITS": "facebook/mms-tts-uig-script_arabic", + "ug_script_cyrillic-facebook-mms VITS": "facebook/mms-tts-uig-script_cyrillic", +} + +OPENAI_TTS_CODES = [ + "af", "ar", "hy", "az", "be", "bs", "bg", "ca", "zh", "hr", "cs", "da", + "nl", "en", "et", "fi", "fr", "gl", "de", "el", "he", "hi", "hu", "is", + "id", "it", "ja", "kn", "kk", "ko", "lv", "lt", "mk", "ms", "mr", "mi", + "ne", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sk", "sl", "es", "sw", + "sv", "tl", "ta", "th", "tr", "uk", "ur", "vi", "cy", "zh-TW" +] + +OPENAI_TTS_MODELS = [ + ">alloy OpenAI-TTS", + ">echo OpenAI-TTS", + ">fable OpenAI-TTS", + ">onyx OpenAI-TTS", + ">nova OpenAI-TTS", + ">shimmer OpenAI-TTS", + ">alloy HD OpenAI-TTS", + ">echo HD OpenAI-TTS", + ">fable HD OpenAI-TTS", + ">onyx HD OpenAI-TTS", + ">nova HD 
OpenAI-TTS", + ">shimmer HD OpenAI-TTS" +] + +LANGUAGE_CODE_IN_THREE_LETTERS = { + "Automatic detection": "aut", + "ar": "ara", + "zh": "chi", + "cs": "cze", + "da": "dan", + "nl": "dut", + "en": "eng", + "fi": "fin", + "fr": "fre", + "de": "ger", + "el": "gre", + "he": "heb", + "hu": "hun", + "it": "ita", + "ja": "jpn", + "ko": "kor", + "fa": "per", + "pl": "pol", + "pt": "por", + "ru": "rus", + "es": "spa", + "tr": "tur", + "uk": "ukr", + "ur": "urd", + "vi": "vie", + "hi": "hin", + "id": "ind", + "bn": "ben", + "te": "tel", + "mr": "mar", + "ta": "tam", + "jw": "jav", + "ca": "cat", + "ne": "nep", + "th": "tha", + "sv": "swe", + "am": "amh", + "cy": "cym", + "et": "est", + "hr": "hrv", + "is": "isl", + "km": "khm", + "sk": "slk", + "sq": "sqi", + "sr": "srp", +} diff --git a/soni_translate/languages_gui.py b/soni_translate/languages_gui.py new file mode 100644 index 0000000000000000000000000000000000000000..ea26babc8d03173d704ab479f87ae27cacbeff1d --- /dev/null +++ b/soni_translate/languages_gui.py @@ -0,0 +1,4208 @@ +# flake8: noqa + +news = """ ## 📖 News + + 🔥 2024/18/05: Overlap reduction. OpenAI API key integration for transcription, translation, and TTS. Output type: subtitles by speaker, separate audio sound, and video only with subtitles. Now you have access to a better-performing version of Whisper for transcribing speech. For example, you can use `kotoba-tech/kotoba-whisper-v1.1` for Japanese transcription, available [here](https://huggingface.co/kotoba-tech/kotoba-whisper-v1.1). You can find these improved models on the [Hugging Face Whisper page](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending&search=whisper). Simply copy the repository ID and paste it into the 'Whisper ASR model' in 'Advanced Settings'. Support for ass subtitles and batch processing with subtitles. Vocal enhancement before transcription. Added CPU mode with `app_rvc.py --cpu_mode`. TTS now supports up to 12 speakers. OpenVoiceV2 has been integrated for voice imitation. PDF to videobook (displays images from the PDF). + + 🔥 2024/03/02: Preserve file names in output. Multiple archives can now be submitted simultaneously by specifying their paths, directories or URLs separated by commas. Added option for disabling diarization. Implemented soft subtitles. Format output (MP3, MP4, MKV, WAV, and OGG), and resolved issues related to file reading and diarization. + + 🔥 2024/02/22: Added freevc for voice imitation, fixed voiceless track, divide segments. New languages support. New translations of the GUI. With subtitle file, no align and the media file is not needed to process the SRT file. Burn subtitles to video. Queue can accept multiple tasks simultaneously. Sound alert notification. Continue process from last checkpoint. Acceleration rate regulation + + 🔥 2024/01/16: Expanded language support, the introduction of whisper large v3, configurable GUI options, integration of BARK, Facebook-mms, Coqui XTTS, and Piper-TTS. Additional features included audio separation utilities, XTTS WAV creation, use an SRT file as a base for translation, document translation, manual speaker editing, and flexible output options (video, audio, subtitles). + + 🔥 2023/10/29: Edit the translated subtitle, download it, adjust volume and speed options. + + 🔥 2023/08/03: Changed default options and added directory view of downloads.. 
+ + 🔥 2023/08/02: Added support for Arabic, Czech, Danish, Finnish, Greek, Hebrew, Hungarian, Korean, Persian, Polish, Russian, Turkish, Urdu, Hindi, and Vietnamese languages. 🌐 + + 🔥 2023/08/01: Add options for use R.V.C. models. + + 🔥 2023/07/27: Fix some bug processing the video and audio. + + 🔥 2023/07/26: New UI and add mix options. + """ + +language_data = { + "english": { + "description": """ + ### 🎥 **Translate videos easily with SoniTranslate!** 📽️ + + Upload a video, subtitle, audio file or provide a URL video link. 📽️ **Gets the updated notebook from the official repository.: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + See the tab `Help` for instructions on how to use it. Let's start having fun with video translation! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instructions for use:** + + 1. 📤 Upload a **video**, **subtitle file**, **audio file**, or provide a 🌐 **URL link** to a video like YouTube. + + 2. 🌍 Choose the language in which you want to **translate the video**. + + 3. 🗣️ Specify the **number of people speaking** in the video and **assign each one a text-to-speech voice** suitable for the translation language. + + 4. 🚀 Press the '**Translate**' button to obtain the results. + + --- + + # 🧩 **SoniTranslate supports different TTS (Text-to-Speech) engines, which are:** + - EDGE-TTS → format `en-AU-WilliamNeural-Male` → Fast and accurate. + - FACEBOOK MMS → format `en-facebook-mms VITS` → The voice is more natural; at the moment, it only uses CPU. + - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Same as the previous one, but it is optimized for both CPU and GPU. + - BARK → format `en_speaker_0-Male BARK` → Good quality but slow, and it is prone to hallucinations. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multilingual but it needs an OpenAI API key. + - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Only available for Chinese (Simplified), English, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Spanish, Hungarian, Korean and Japanese. + + --- + + # 🎤 How to Use R.V.C. and R.V.C.2 Voices (Optional) 🎶 + + The goal is to apply a R.V.C. to the generated TTS (Text-to-Speech) 🎙️ + + 1. In the `Custom Voice R.V.C.` tab, download the models you need 📥 You can use links from Hugging Face and Google Drive in formats like zip, pth, or index. You can also download complete HF space repositories, but this option is not very stable 😕 + + 2. Now, go to `Replace voice: TTS to R.V.C.` and check the `enable` box ✅ After this, you can choose the models you want to apply to each TTS speaker 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Adjust the F0 method that will be applied to all R.V.C. 🎛️ + + 4. Press `APPLY CONFIGURATION` to apply the changes you made 🔄 + + 5. Go back to the video translation tab and click on 'Translate' ▶️ Now, the translation will be done applying the R.V.C. 🗣️ + + Tip: You can use `Test R.V.C.` to experiment and find the best TTS or configurations to apply to the R.V.C. 
🧪🔍 + + --- + + """, + "tab_translate": "Video translation", + "video_source": "Choose Video Source", + "link_label": "Media link.", + "link_info": "Example: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL goes here...", + "dir_label": "Video Path.", + "dir_info": "Example: /usr/home/my_video.mp4", + "dir_ph": "Path goes here...", + "sl_label": "Source language", + "sl_info": "This is the original language of the video", + "tat_label": "Translate audio to", + "tat_info": "Select the target language and also make sure to choose the corresponding TTS for that language.", + "num_speakers": "Select how many people are speaking in the video.", + "min_sk": "Min speakers", + "max_sk": "Max speakers", + "tts_select": "Select the voice you want for each speaker.", + "sk1": "TTS Speaker 1", + "sk2": "TTS Speaker 2", + "sk3": "TTS Speaker 3", + "sk4": "TTS Speaker 4", + "sk5": "TTS Speaker 5", + "sk6": "TTS Speaker 6", + "sk7": "TTS Speaker 7", + "sk8": "TTS Speaker 8", + "sk9": "TTS Speaker 9", + "sk10": "TTS Speaker 10", + "sk11": "TTS Speaker 11", + "sk12": "TTS Speaker 12", + "vc_title": "Voice Imitation in Different Languages", + "vc_subtitle": """ + ### Replicate a person's voice across various languages. + While effective with most voices when used appropriately, it may not achieve perfection in every case. + Voice Imitation solely replicates the reference speaker's tone, excluding accent and emotion, which are governed by the base speaker TTS model and not replicated by the converter. + This will take audio samples from the main audio for each speaker and process them. + """, + "vc_active_label": "Active Voice Imitation", + "vc_active_info": "Active Voice Imitation: Replicates the original speaker's tone", + "vc_method_label": "Method", + "vc_method_info": "Select a method for Voice Imitation process", + "vc_segments_label": "Max samples", + "vc_segments_info": "Max samples: Is the number of audio samples that will be generated for the process, more is better but it can add noise", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Applies vocal dereverb to the audio samples.", + "vc_remove_label": "Remove previous samples", + "vc_remove_info": "Remove previous samples: Remove the previous samples generated, so new ones need to be created.", + "xtts_title": "Create a TTS based on an audio", + "xtts_subtitle": "Upload an audio file of maximum 10 seconds with a voice. Using XTTS, a new TTS will be created with a voice similar to the provided audio file.", + "xtts_file_label": "Upload a short audio with the voice", + "xtts_name_label": "Name for the TTS", + "xtts_name_info": "Use a simple name", + "xtts_dereverb_label": "Dereverb audio", + "xtts_dereverb_info": "Dereverb audio: Applies vocal dereverb to the audio", + "xtts_button": "Process the audio and include it in the TTS selector", + "xtts_footer": "Generate voice xtts automatically: You can use `_XTTS_/AUTOMATIC.wav` in the TTS selector to automatically generate segments for each speaker when generating the translation.", + "extra_setting": "Advanced Settings", + "acc_max_label": "Max Audio acceleration", + "acc_max_info": "Maximum acceleration for translated audio segments to avoid overlapping. 
A value of 1.0 represents no acceleration", + "acc_rate_label": "Acceleration Rate Regulation", + "acc_rate_info": "Acceleration Rate Regulation: Adjusts acceleration to accommodate segments requiring less speed, maintaining continuity and considering next-start timing.", + "or_label": "Overlap Reduction", + "or_info": "Overlap Reduction: Ensures segments don't overlap by adjusting start times based on previous end times; could disrupt synchronization.", + "aud_mix_label": "Audio Mixing Method", + "aud_mix_info": "Mix original and translated audio files to create a customized, balanced output with two available mixing modes.", + "vol_ori": "Volume original audio", + "vol_tra": "Volume translated audio", + "voiceless_tk_label": "Voiceless Track", + "voiceless_tk_info": "Voiceless Track: Remove the original audio voices before combining it with the translated audio.", + "sub_type": "Subtitle type", + "soft_subs_label": "Soft Subtitles", + "soft_subs_info": "Soft Subtitles: Optional subtitles that viewers can turn on or off while watching the video.", + "burn_subs_label": "Burn Subtitles", + "burn_subs_info": "Burn Subtitles: Embed subtitles into the video, making them a permanent part of the visual content.", + "whisper_title": "Config transcription.", + "lnum_label": "Literalize Numbers", + "lnum_info": "Literalize Numbers: Replace numerical representations with their written equivalents in the transcript.", + "scle_label": "Sound Cleanup", + "scle_info": "Sound Cleanup: Enhance vocals, remove background noise before transcription for utmost timestamp precision. This operation may take time, especially with lengthy audio files.", + "sd_limit_label": "Segment Duration Limit", + "sd_limit_info": "Specify the maximum duration (in seconds) for each segment. The audio will be processed using VAD, limiting the duration for each segment chunk.", + "asr_model_info": "It converts spoken language to text using the 'Whisper model' by default. Use a custom model, for example, by inputting the repository name 'BELLE-2/Belle-whisper-large-v3-zh' in the dropdown to utilize a Chinese language finetuned model. Find finetuned models on Hugging Face.", + "ctype_label": "Compute type", + "ctype_info": "Choosing smaller types like int8 or float16 can improve performance by reducing memory usage and increasing computational throughput, but may sacrifice precision compared to larger data types like float32.", + "batchz_label": "Batch size", + "batchz_info": "Reducing the batch size saves memory if your GPU has less VRAM and helps manage Out of Memory issues.", + "tsscale_label": "Text Segmentation Scale", + "tsscale_info": "Divide text into segments by sentences, words, or characters. Word and character segmentation offer finer granularity, useful for subtitles; disabling translation preserves original structure.", + "srt_file_label": "Upload an SRT subtitle file (will be used instead of the transcription of Whisper)", + "divide_text_label": "Redivide text segments by:", + "divide_text_info": "(Experimental) Enter a separator to split existing text segments in the source language. The tool will identify occurrences and create new segments accordingly. 
Specify multiple separators using |, e.g.: !|?|...|。", + "diarization_label": "Diarization model", + "tr_process_label": "Translation process", + "out_type_label": "Output type", + "out_name_label": "File name", + "out_name_info": "The name of the output file", + "task_sound_label": "Task Status Sound", + "task_sound_info": "Task Status Sound: Plays a sound alert indicating task completion or errors during execution.", + "cache_label": "Retrieve Progress", + "cache_info": "Retrieve Progress: Continue process from last checkpoint.", + "preview_info": "Preview cuts the video to only 10 seconds for testing purposes. Please deactivate it to retrieve the full video duration.", + "edit_sub_label": "Edit generated subtitles", + "edit_sub_info": "Edit generated subtitles: Allows you to run the translation in 2 steps. First with the 'GET SUBTITLES AND EDIT' button, you get the subtitles to edit them, and then with the 'TRANSLATE' button, you can generate the video", + "button_subs": "GET SUBTITLES AND EDIT", + "editor_sub_label": "Generated subtitles", + "editor_sub_info": "Feel free to edit the text in the generated subtitles here. You can make changes to the interface options before clicking the 'TRANSLATE' button, except for 'Source language', 'Translate audio to', and 'Max speakers', to avoid errors. Once you're finished, click the 'TRANSLATE' button.", + "editor_sub_ph": "First press 'GET SUBTITLES AND EDIT' to get the subtitles", + "button_translate": "TRANSLATE", + "output_result_label": "DOWNLOAD TRANSLATED VIDEO", + "sub_ori": "Subtitles", + "sub_tra": "Translated subtitles", + "ht_token_info": "One important step is to accept the license agreement for using Pyannote. You need to have an account on Hugging Face and accept the license to use the models: https://huggingface.co/pyannote/speaker-diarization and https://huggingface.co/pyannote/segmentation. Get your KEY TOKEN here: https://hf.co/settings/tokens", + "ht_token_ph": "Token goes here...", + "tab_docs": "Document translation", + "docs_input_label": "Choose Document Source", + "docs_input_info": "It can be PDF, DOCX, TXT, or text", + "docs_source_info": "This is the original language of the text", + "chunk_size_label": "Max number of characters that the TTS will process per segment", + "chunk_size_info": "A value of 0 assigns a dynamic and more compatible value for the TTS.", + "docs_button": "Start Language Conversion Bridge", + "cv_url_info": "Automatically download the R.V.C. models from the URL. You can use links from HuggingFace or Drive, and you can include several links, each one separated by a comma. Example: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Replace voice: TTS to R.V.C.", + "sec1_title": "### 1. To enable its use, mark it as enable.", + "enable_replace": "Check this to enable the use of the models.", + "sec2_title": "### 2. Select a voice that will be applied to each TTS of each corresponding speaker and apply the configurations.", + "sec2_subtitle": "Depending on how many you will use, each one needs its respective model. 
Additionally, there is an auxiliary one if for some reason the speaker is not detected correctly.", + "cv_tts1": "Choose the voice to apply for Speaker 1.", + "cv_tts2": "Choose the voice to apply for Speaker 2.", + "cv_tts3": "Choose the voice to apply for Speaker 3.", + "cv_tts4": "Choose the voice to apply for Speaker 4.", + "cv_tts5": "Choose the voice to apply for Speaker 5.", + "cv_tts6": "Choose the voice to apply for Speaker 6.", + "cv_tts7": "Choose the voice to apply for Speaker 7.", + "cv_tts8": "Choose the voice to apply for Speaker 8.", + "cv_tts9": "Choose the voice to apply for Speaker 9.", + "cv_tts10": "Choose the voice to apply for Speaker 10.", + "cv_tts11": "Choose the voice to apply for Speaker 11.", + "cv_tts12": "Choose the voice to apply for Speaker 12.", + "cv_aux": "- Voice to apply in case a Speaker is not detected successfully.", + "cv_button_apply": "APPLY CONFIGURATION", + "tab_help": "Help", + }, + "spanish": { + "description": """ + ### 🎥 **¡Traduce videos fácilmente con SoniTranslate!** 📽️ + + Sube un video, audio o proporciona un enlace de YouTube. 📽️ **Obtén el cuaderno actualizado desde el repositorio oficial: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Consulta la pestaña `Ayuda` para obtener instrucciones sobre cómo usarlo. ¡Comencemos a divertirnos con la traducción de videos! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instrucciones de uso:** + + 1. 📤 Sube un archivo de **video**, **audio** o proporciona un enlace de 🌐 **YouTube**. + + 2. 🌍 Elige el idioma en el que deseas **traducir el video**. + + 3. 🗣️ Especifica el **número de personas que hablan** en el video y **asigna a cada una una voz de texto a voz** adecuada para el idioma de traducción. + + 4. 🚀 Presiona el botón '**Traducir**' para obtener los resultados. + + --- + + # 🧩 **SoniTranslate admite diferentes motores de TTS (Texto a Voz), los cuales son:** + - EDGE-TTS → formato `en-AU-WilliamNeural-Male` → Rapidos y precisos. + - FACEBOOK MMS → formato `en-facebook-mms VITS` → Voz más natural, por el momento solo usa CPU. + - PIPER TTS → formato `en_US-lessac-high VITS-onnx` → Igual que el anterior, pero está optimizado tanto para CPU como para GPU. + - BARK → formato `en_speaker_0-Male BARK` → De buena calidad pero lento y propenso a alucinaciones. + - OpenAI TTS → formato `>alloy OpenAI-TTS` → Multilingüe pero necesita una OpenAI API key. + - Coqui XTTS → formato `_XTTS_/AUTOMATIC.wav` → Solo disponible para Chinese (Simplified), English, French, German, Italian, Portuguese, Polish, Turkish, Russian, Dutch, Czech, Arabic, Spanish, Hungarian, Korean y Japanese. + + --- + + # 🎤 Cómo usar las voces R.V.C. y R.V.C.2 (Opcional) 🎶 + + El objetivo es aplicar un R.V.C. al TTS (Texto a Voz) generado 🎙️ + + 1. En la pestaña `Voz Personalizada R.V.C.`, descarga los modelos que necesitas 📥 Puedes utilizar enlaces de Hugging Face y Google Drive en formatos como zip, pth o index. También puedes descargar repositorios completos de espacio HF, pero esta opción no es muy estable 😕 + + 2. Ahora, ve a `Reemplazar voz: TTS a R.V.C.` y marca la casilla `habilitar` ✅ Después de esto, puedes elegir los modelos que deseas aplicar a cada hablante de TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Ajusta el método F0 que se aplicará a todos los R.V.C. 🎛️ + + 4. Presiona `APLICAR CONFIGURACIÓN` para aplicar los cambios que hayas realizado 🔄 + + 5. Vuelve a la pestaña de traducción de video y haz clic en 'Traducir' ▶️ Ahora, la traducción se realizará aplicando el R.V.C. 
🗣️ + + Consejo: Puedes usar `Probar R.V.C.` para experimentar y encontrar el mejor TTS o configuraciones para aplicar al R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Traducción de video", + "video_source": "Seleccionar Fuente de Video", + "link_label": "URL del video.", + "link_info": "Ejemplo: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "Ingrese la URL aquí...", + "dir_label": "Ubicación del video.", + "dir_info": "Ejemplo: /usr/home/my_video.mp4", + "dir_ph": "Ingrese la ruta aquí...", + "sl_label": "Idioma de origen", + "sl_info": "Este es el idioma original del video", + "tat_label": "Traducir audio a", + "tat_info": "Seleccione el idioma de destino y asegúrese también de seleccionar los TTS correspondientes a ese lenguaje.", + "num_speakers": "Seleccione cuántas personas están hablando en el video.", + "min_sk": "Mín. de hablantes", + "max_sk": "Máx. de hablantes", + "tts_select": "Seleccione la voz que desea para cada hablante.", + "sk1": "TTS Hablante 1", + "sk2": "TTS Hablante 2", + "sk3": "TTS Hablante 3", + "sk4": "TTS Hablante 4", + "sk5": "TTS Hablante 5", + "sk6": "TTS Hablante 6", + "sk7": "TTS Hablante 7", + "sk8": "TTS Hablante 8", + "sk9": "TTS Hablante 9", + "sk10": "TTS Hablante 10", + "sk11": "TTS Hablante 11", + "sk12": "TTS Hablante 12", + "vc_title": "Imitación de voz en diferentes idiomas", + "vc_subtitle": """ + ### Replicar la voz de una persona en varios idiomas. + Si bien es efectiva con la mayoría de las voces cuando se usa adecuadamente, puede no alcanzar la perfección en todos los casos. + La imitación de voz solo replica el tono del hablante de referencia, excluyendo el acento y la emoción, que son controlados por el modelo TTS del hablante base y no son replicados por el convertidor. + Esto tomará muestras de audio del audio principal para cada hablante y las procesará. + """, + "vc_active_label": "Imitación de voz activa", + "vc_active_info": "Imitación de voz activa: Replica el tono del hablante original", + "vc_method_label": "Método", + "vc_method_info": "Selecciona un método para el proceso de imitación de voz", + "vc_segments_label": "Máximo de muestras", + "vc_segments_info": "Máximo de muestras: Es el número de muestras de audio que se generarán para el proceso, más es mejor pero puede agregar ruido", + "vc_dereverb_label": "Dereverberación", + "vc_dereverb_info": "Dereverberación: Aplica la dereverberación vocal a las muestras de audio.", + "vc_remove_label": "Eliminar muestras anteriores", + "vc_remove_info": "Eliminar muestras anteriores: Elimina las muestras generadas anteriormente, por lo que es necesario crear nuevas.", + "xtts_title": "Crear un TTS basado en un audio", + "xtts_subtitle": "Sube un archivo de audio de máximo 10 segundos con una voz. Utilizando XTTS, se creará un nuevo TTS con una voz similar al archivo de audio proporcionado.", + "xtts_file_label": "Subir un breve audio con la voz", + "xtts_name_label": "Nombre para el TTS", + "xtts_name_info": "Usa un nombre sencillo", + "xtts_dereverb_label": "Dereverberación del audio", + "xtts_dereverb_info": "Dereverberación del audio: Aplica la dereverberación vocal al audio", + "xtts_button": "Procesar el audio e incluirlo en el selector de TTS", + "xtts_footer": "Generar voz XTTS automáticamente: Puedes usar `_XTTS_/AUTOMATIC.wav` en el selector de TTS para generar automáticamente segmentos para cada hablante al generar la traducción.", + "extra_setting": "Configuraciones Avanzadas", + "acc_max_label": "Máx. 
de Aceleración de Audio", + "acc_max_info": "Aceleración máxima para segmentos de audio traducidos para evitar superposiciones. Un valor de 1.0 representa ninguna aceleración.", + "acc_rate_label": "Regulación de la Tasa de Aceleración", + "acc_rate_info": "Regulación de la Tasa de Aceleración: Ajusta la aceleración para adaptarse a segmentos que requieren menos velocidad, manteniendo la continuidad y considerando el momento de inicio siguiente.", + "or_label": "Reducción de superposición", + "or_info": "Reducción de superposición: Asegura que los segmentos no se superpongan ajustando los tiempos de inicio en función de los tiempos de finalización anteriores; podría interrumpir la sincronización.", + "aud_mix_label": "Método de Mezcla de Audio", + "aud_mix_info": "Mezclar archivos de audio original y traducido para crear una salida personalizada y equilibrada con dos modos de mezcla disponibles.", + "vol_ori": "Volumen audio original", + "vol_tra": "Volumen audio traducido", + "voiceless_tk_label": "Pista sin voz", + "voiceless_tk_info": "Pista sin voz: Elimina las voces originales del audio antes de combinarlo con el audio traducido.", + "sub_type": "Tipo de Subtítulos", + "soft_subs_label": "Subtítulos Suaves", + "soft_subs_info": "Subtítulos Suaves: Subtítulos opcionales que los espectadores pueden activar o desactivar mientras ven el video.", + "burn_subs_label": "Grabar subtítulos", + "burn_subs_info": "Grabar subtítulos: Incrusta los subtítulos en el video, convirtiéndolos en una parte permanente del contenido visual.", + "whisper_title": "Configuracion Transcripción.", + "lnum_label": "Literalizar Números", + "lnum_info": "Literalizar Números: Reemplazar representaciones numéricas con sus equivalentes escritos en la transcripción.", + "scle_label": "Limpieza de Sonido", + "scle_info": "Limpieza de Sonido: Mejora de vocales, elimina ruido de fondo antes de la transcripción para una precisión máxima en la marca de tiempo. Esta operación puede tomar tiempo, especialmente con archivos de audio extensos.", + "sd_limit_label": "Límite de Duración del Segmento", + "sd_limit_info": "Especifique la duración máxima (en segundos) para cada segmento. El audio se procesará utilizando VAD, limitando la duración para cada fragmento de segmento.", + "asr_model_info": "Convierte el lenguaje hablado a texto utilizando el modelo 'Whisper' de forma predeterminada. Utilice un modelo personalizado, por ejemplo, ingresando el nombre del repositorio 'BELLE-2/Belle-whisper-large-v3-zh' en el menú desplegable para utilizar un modelo en chino preajustado. Encuentre modelos preajustados en Hugging Face.", + "ctype_label": "Tipo de Cálculo", + "ctype_info": "Elegir tipos más pequeños como int8 o float16 puede mejorar el rendimiento al reducir el uso de memoria y aumentar el rendimiento computacional, pero puede sacrificar precisión en comparación con tipos de datos más grandes como float32.", + "batchz_label": "Tamaño del Lote", + "batchz_info": "Reducir el tamaño del lote ahorra memoria si su GPU tiene menos VRAM y ayuda a gestionar problemas de falta de memoria.", + "tsscale_label": "Escala de Segmentación de Texto", + "tsscale_info": "Divide el texto en segmentos por oraciones, palabras o caracteres. 
La segmentación por palabras y caracteres ofrece una granularidad más fina, útil para subtítulos; desactivar la traducción conserva la estructura original.", + "srt_file_label": "Subir un archivo de subtítulos SRT (Se utilizará en lugar de la transcripción de Whisper)", + "divide_text_label": "Redividir segmentos de texto por:", + "divide_text_info": "(Experimental) Ingresa un separador para dividir los segmentos de texto existentes en el idioma origen. La herramienta identificará las ocurrencias y creará nuevos segmentos en consecuencia. Especifica múltiples separadores usando |, por ejemplo: !|?|...|。", + "diarization_label": "Modelo de diarización", + "tr_process_label": "Proceso de traducción", + "out_type_label": "Tipo de salida", + "out_name_label": "Nombre del archivo", + "out_name_info": "El nombre del archivo de salida", + "task_sound_label": "Sonido de estado de la tarea", + "task_sound_info": "Sonido de estado de la tarea: Reproduce una alerta de sonido que indica la finalización de la tarea o errores durante la ejecución.", + "cache_label": "Recuperar Progreso", + "cache_info": "Recuperar Progreso: Continuar proceso desde el último punto de control.", + "preview_info": "La vista previa corta el video a solo 10 segundos con fines de prueba. Desactívelo para obtener la duración completa del video.", + "edit_sub_label": "Editar subtítulos generados", + "edit_sub_info": "Editar subtítulos generados: Permite ejecutar la traducción en 2 pasos. Primero, con el botón 'OBTENER SUBTÍTULOS Y EDITAR', obtiene los subtítulos para editarlos, y luego con el botón 'TRADUCIR', puede generar el video.", + "button_subs": "OBTENER SUBTÍTULOS Y EDITAR", + "editor_sub_label": "Subtítulos generados", + "editor_sub_info": "Siéntase libre de editar el texto de los subtítulos generados aquí. Puede realizar cambios en las opciones de la interfaz antes de hacer clic en el botón 'TRADUCIR', excepto en 'Idioma de origen', 'Traducir audio a' y 'Máx. de hablantes', para evitar errores. Una vez que haya terminado, haga clic en el botón 'TRADUCIR'.", + "editor_sub_ph": "Presione primero 'OBTENER SUBTÍTULOS Y EDITAR' para obtener los subtítulos", + "button_translate": "TRADUCIR", + "output_result_label": "DESCARGAR VIDEO TRADUCIDO", + "sub_ori": "Subtítulos originales", + "sub_tra": "Subtítulos traducidos", + "ht_token_info": "Un paso importante es aceptar el acuerdo de licencia para usar Pyannote. Debe tener una cuenta en Hugging Face y aceptar la licencia para usar los modelos: https://huggingface.co/pyannote/speaker-diarization y https://huggingface.co/pyannote/segmentation. Obtenga su TOKEN aquí: https://hf.co/settings/tokens", + "ht_token_ph": "Ingrese el token aquí...", + "tab_docs": "Traducción de documento", + "docs_input_label": "Elegir origen del documento", + "docs_input_info": "Puede ser PDF, DOCX, TXT o texto", + "docs_source_info": "Este es el idioma original del texto", + "chunk_size_label": "Máximo número de caracteres que el TTS procesará por segmento.", + "chunk_size_info": "Un valor de 0 asigna un valor dinámico y más compatible con el TTS.", + "docs_button": "Iniciar Puente de Conversión de Idioma", + "cv_url_info": "Descargue automáticamente los modelos R.V.C. desde la URL. Puede utilizar enlaces de HuggingFace o Drive, e incluso puede incluir varios enlaces, cada uno separado por una coma. 
Ejemplo: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Reemplazar voz: TTS a R.V.C.", + "sec1_title": "### 1. Para habilitar su uso, márquelo como habilitado.", + "enable_replace": "Marque esto para habilitar el uso de los modelos.", + "sec2_title": "### 2. Seleccione una voz que se aplicará a cada TTS de cada hablante correspondiente y aplique las configuraciones.", + "sec2_subtitle": "Dependiendo de cuántos vaya a usar, cada uno necesita su respectivo modelo. Además, hay uno auxiliar si por alguna razón el hablante no es detectado correctamente.", + "cv_tts1": "Voz a aplicar al TTS Hablante 1.", + "cv_tts2": "Voz a aplicar al TTS Hablante 2.", + "cv_tts3": "Voz a aplicar al TTS Hablante 3.", + "cv_tts4": "Voz a aplicar al TTS Hablante 4.", + "cv_tts5": "Voz a aplicar al TTS Hablante 5.", + "cv_tts6": "Voz a aplicar al TTS Hablante 6.", + "cv_tts7": "Voz a aplicar al TTS Hablante 7.", + "cv_tts8": "Voz a aplicar al TTS Hablante 8.", + "cv_tts9": "Voz a aplicar al TTS Hablante 9.", + "cv_tts10": "Voz a aplicar al TTS Hablante 10.", + "cv_tts11": "Voz a aplicar al TTS Hablante 11.", + "cv_tts12": "Voz a aplicar al TTS Hablante 12.", + "cv_aux": "- Voz a aplicar en caso de que un hablante no sea detectado correctamente.", + "cv_button_apply": "APLICAR CONFIGURACIÓN", + "tab_help": "Ayuda", + }, + "french": { + "description": """ + ### 🎥 **Traduisez facilement les vidéos avec SoniTranslate !** 📽️ + + Téléchargez une vidéo, un fichier audio ou fournissez un lien YouTube. 📽️ **Obtenez le notebook mis à jour à partir du référentiel officiel : [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Consultez l'onglet `Aide` pour des instructions sur son utilisation. Amusons-nous à traduire des vidéos ! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instructions d'utilisation :** + + 1. 📤 Téléchargez une **vidéo**, un **fichier audio** ou fournissez un lien 🌐 **YouTube**. + + 2. 🌍 Choisissez la langue dans laquelle vous souhaitez **traduire la vidéo**. + + 3. 🗣️ Spécifiez le **nombre de personnes parlant** dans la vidéo et **attribuez à chacune une voix de synthèse textuelle** adaptée à la langue de traduction. + + 4. 🚀 Appuyez sur le bouton '**Traduire**' pour obtenir les résultats. + + --- + + # 🧩 **SoniTranslate prend en charge différents moteurs TTS (Text-to-Speech), à savoir :** + - EDGE-TTS → format `en-AU-WilliamNeural-Male` → Rapide et précis. + - FACEBOOK MMS → format `en-facebook-mms VITS` → La voix est plus naturelle ; pour le moment, il utilise uniquement le CPU. + - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Identique au précédent, mais optimisé pour le CPU et le GPU. + - BARK → format `en_speaker_0-Male BARK` → Bonne qualité mais lent, et sujet aux hallucinations. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multilingue mais nécessite une OpenAI API key. + - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Disponible uniquement pour le chinois (simplifié), l'anglais, le français, l'allemand, l'italien, le portugais, le polonais, le turc, le russe, le néerlandais, le tchèque, l'arabe, l'espagnol, le hongrois, le coréen et le japonais. + + --- + + # 🎤 Comment utiliser les voix R.V.C. et R.V.C.2 (Facultatif) 🎶 + + L'objectif est d'appliquer un R.V.C. à la TTS (Text-to-Speech) générée 🎙️ + + 1. 
Dans l'onglet `Voix personnalisée R.V.C.`, téléchargez les modèles dont vous avez besoin 📥 Vous pouvez utiliser des liens depuis Hugging Face et Google Drive dans des formats tels que zip, pth, ou index. Vous pouvez également télécharger des dépôts complets de l'espace HF, mais cette option n'est pas très stable 😕 + + 2. Allez maintenant dans `Remplacer la voix : TTS par R.V.C.` et cochez la case `activer` ✅ Ensuite, vous pouvez choisir les modèles que vous souhaitez appliquer à chaque locuteur TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Ajustez la méthode F0 qui sera appliquée à tous les R.V.C. 🎛️ + + 4. Appuyez sur `APPLIQUER LA CONFIGURATION` pour appliquer les modifications que vous avez apportées 🔄 + + 5. Retournez à l'onglet de traduction vidéo et cliquez sur 'Traduire' ▶️ Maintenant, la traduction se fera en appliquant le R.V.C. 🗣️ + + Astuce : Vous pouvez utiliser `Test R.V.C.` pour expérimenter et trouver les meilleures TTS ou configurations à appliquer au R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Traduction vidéo", + "video_source": "Choisir la source vidéo", + "link_label": "Lien multimédia.", + "link_info": "Exemple : www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "L'URL va ici...", + "dir_label": "Chemin de la vidéo.", + "dir_info": "Exemple : /usr/home/ma_video.mp4", + "dir_ph": "Le chemin va ici...", + "sl_label": "Langue source", + "sl_info": "Il s'agit de la langue d'origine de la vidéo", + "tat_label": "Traduire l'audio en", + "tat_info": "Sélectionnez la langue cible et assurez-vous également de choisir le TTS correspondant pour cette langue.", + "num_speakers": "Sélectionnez combien de personnes parlent dans la vidéo.", + "min_sk": "Locuteurs min", + "max_sk": "Locuteurs max", + "tts_select": "Sélectionnez la voix que vous souhaitez pour chaque locuteur.", + "sk1": "Locuteur TTS 1", + "sk2": "Locuteur TTS 2", + "sk3": "Locuteur TTS 3", + "sk4": "Locuteur TTS 4", + "sk5": "Locuteur TTS 5", + "sk6": "Locuteur TTS 6", + "sk7": "Locuteur TTS 7", + "sk8": "Locuteur TTS 8", + "sk9": "Locuteur TTS 9", + "sk10": "Locuteur TTS 10", + "sk11": "Locuteur TTS 11", + "sk12": "Locuteur TTS 12", + "vc_title": "Imitation de voix dans différentes langues", + "vc_subtitle": """ + ### Répliquez la voix d'une personne dans différentes langues. + Bien que efficace avec la plupart des voix lorsqu'il est utilisé correctement, cela peut ne pas atteindre la perfection dans tous les cas. + L'imitation de voix ne reproduit que le ton du locuteur de référence, excluant l'accent et l'émotion, qui sont régis par le modèle TTS du locuteur de base et non reproduits par le convertisseur. + Cela prendra des échantillons audio de l'audio principal pour chaque locuteur et les traitera. 
+ """, + "vc_active_label": "Imitation de voix active", + "vc_active_info": "Imitation de voix active : Reproduit le ton du locuteur original", + "vc_method_label": "Méthode", + "vc_method_info": "Sélectionnez une méthode pour le processus d'imitation de voix", + "vc_segments_label": "Échantillons max", + "vc_segments_info": "Échantillons max : Nombre d'échantillons audio qui seront générés pour le processus, plus il y en a, mieux c'est, mais cela peut ajouter du bruit", + "vc_dereverb_label": "Déréverbération", + "vc_dereverb_info": "Déréverbération : Applique une déréverbération vocale aux échantillons audio.", + "vc_remove_label": "Supprimer les échantillons précédents", + "vc_remove_info": "Supprimer les échantillons précédents : Supprime les échantillons précédents générés, de sorte que de nouveaux doivent être créés.", + "xtts_title": "Créer un TTS basé sur un audio", + "xtts_subtitle": "Téléchargez un fichier audio d'une durée maximale de 10 secondes avec une voix. En utilisant XTTS, un nouveau TTS sera créé avec une voix similaire au fichier audio fourni.", + "xtts_file_label": "Télécharger un court audio avec la voix", + "xtts_name_label": "Nom pour le TTS", + "xtts_name_info": "Utilisez un nom simple", + "xtts_dereverb_label": "Déréverbération de l'audio", + "xtts_dereverb_info": "Déréverbération de l'audio : Applique une déréverbération vocale à l'audio", + "xtts_button": "Traiter l'audio et l'inclure dans le sélecteur TTS", + "xtts_footer": "Générer automatiquement un TTS vocal : Vous pouvez utiliser `_XTTS_/AUTOMATIC.wav` dans le sélecteur TTS pour générer automatiquement des segments pour chaque locuteur lors de la génération de la traduction.", + "extra_setting": "Paramètres avancés", + "acc_max_label": "Accélération audio max", + "acc_max_info": "Accélération maximale pour les segments audio traduits afin d'éviter les chevauchements. 
Une valeur de 1,0 représente aucune accélération", + "acc_rate_label": "Régulation du taux d'accélération", + "acc_rate_info": "Régulation du taux d'accélération : Ajuste l'accélération pour prendre en compte les segments nécessitant moins de vitesse, en maintenant la continuité et en tenant compte du timing du prochain démarrage.", + "or_label": "Réduction des chevauchements", + "or_info": "Réduction des chevauchements : Garantit que les segments ne se chevauchent pas en ajustant les heures de début en fonction des heures de fin précédentes ; pourrait perturber la synchronisation.", + "aud_mix_label": "Méthode de mixage audio", + "aud_mix_info": "Mixer les fichiers audio original et traduit pour créer une sortie équilibrée et personnalisée avec deux modes de mixage disponibles.", + "vol_ori": "Volume audio original", + "vol_tra": "Volume audio traduit", + "voiceless_tk_label": "Piste sans voix", + "voiceless_tk_info": "Piste sans voix : Supprime les voix audio originales avant de les combiner avec l'audio traduit.", + "sub_type": "Type de sous-titres", + "soft_subs_label": "Sous-titres souples", + "soft_subs_info": "Sous-titres souples : Sous-titres facultatifs que les spectateurs peuvent activer ou désactiver pendant le visionnage de la vidéo.", + "burn_subs_label": "Incorporer les sous-titres", + "burn_subs_info": "Incorporer les sous-titres : Intégrer les sous-titres dans la vidéo, les rendant ainsi une partie permanente du contenu visuel.", + "whisper_title": "Config transcription.", + "lnum_label": "Literaliser les Nombres", + "lnum_info": "Literaliser les Nombres: Remplacer les représentations numériques par leurs équivalents écrits dans la transcription.", + "scle_label": "Nettoyage du Son", + "scle_info": "Nettoyage du Son: Amélioration des voix, suppression du bruit de fond avant la transcription pour une précision maximale des horodatages. Cette opération peut prendre du temps, notamment avec des fichiers audio volumineux.", + "sd_limit_label": "Limite de Durée du Segment", + "sd_limit_info": "Spécifiez la durée maximale (en secondes) pour chaque segment. L'audio sera traité en utilisant VAD, limitant la durée pour chaque fragment de segment.", + "asr_model_info": "Il convertit la langue parlée en texte en utilisant le modèle 'Whisper' par défaut. Utilisez un modèle personnalisé, par exemple, en saisissant le nom du référentiel 'BELLE-2/Belle-whisper-large-v3-zh' dans la liste déroulante pour utiliser un modèle chinois préajusté. Trouvez des modèles préajustés sur Hugging Face.", + "ctype_label": "Type de Calcul", + "ctype_info": "Choisir des types plus petits comme int8 ou float16 peut améliorer les performances en réduisant l'utilisation de la mémoire et en augmentant le débit computationnel, mais peut sacrifier la précision par rapport à des types de données plus grands comme float32.", + "batchz_label": "Taille du Lot", + "batchz_info": "Réduire la taille du lot permet d'économiser de la mémoire si votre GPU dispose de moins de VRAM et aide à gérer les problèmes de mémoire insuffisante.", + "tsscale_label": "Échelle de Segmentation de Texte", + "tsscale_info": "Divisez le texte en segments par phrases, mots ou caractères. 
La segmentation par mots et caractères offre une granularité plus fine, utile pour les sous-titres; désactiver la traduction conserve la structure d'origine.", + "srt_file_label": "Télécharger un fichier de sous-titres SRT (sera utilisé à la place de la transcription de Whisper)", + "divide_text_label": "Rediviser les segments de texte par :", + "divide_text_info": "(Expérimental) Entrez un séparateur pour diviser les segments de texte existants dans la langue source. L'outil identifiera les occurrences et créera de nouveaux segments en conséquence. Spécifiez plusieurs séparateurs en utilisant |, par ex. : !|?|...|。", + "diarization_label": "Modèle de diarisation", + "tr_process_label": "Processus de traduction", + "out_type_label": "Type de sortie", + "out_name_label": "Nom de fichier", + "out_name_info": "Le nom du fichier de sortie", + "task_sound_label": "Son d'état de la tâche", + "task_sound_info": "Son d'état de la tâche : Joue une alerte sonore indiquant la fin de la tâche ou les erreurs lors de l'exécution.", + "cache_label": "Récupération de la progression", + "cache_info": "Récupération de la progression : Continuer le processus depuis le dernier point de contrôle.", + "preview_info": "L'aperçu coupe la vidéo à seulement 10 secondes à des fins de test. Veuillez le désactiver pour récupérer la durée complète de la vidéo.", + "edit_sub_label": "Modifier les sous-titres générés", + "edit_sub_info": "Modifier les sous-titres générés : Vous permet d'exécuter la traduction en 2 étapes. Tout d'abord avec le bouton 'OBTENIR LES SOUS-TITRES ET ÉDITER', vous obtenez les sous-titres pour les éditer, puis avec le bouton 'TRADUIRE', vous pouvez générer la vidéo", + "button_subs": "OBTENIR LES SOUS-TITRES ET ÉDITER", + "editor_sub_label": "Sous-titres générés", + "editor_sub_info": "N'hésitez pas à éditer le texte dans les sous-titres générés ici. Vous pouvez apporter des modifications aux options d'interface avant de cliquer sur le bouton 'TRADUIRE', sauf pour 'Langue source', 'Traduire l'audio en' et 'Locuteurs max', pour éviter les erreurs. Une fois terminé, cliquez sur le bouton 'TRADUIRE'.", + "editor_sub_ph": "Appuyez d'abord sur 'OBTENIR LES SOUS-TITRES ET ÉDITER' pour obtenir les sous-titres", + "button_translate": "TRADUIRE", + "output_result_label": "TÉLÉCHARGER LA VIDÉO TRADUITE", + "sub_ori": "Sous-titres", + "sub_tra": "Sous-titres traduits", + "ht_token_info": "Une étape importante est d'accepter l'accord de licence pour utiliser Pyannote. Vous devez avoir un compte sur Hugging Face et accepter la licence pour utiliser les modèles : https://huggingface.co/pyannote/speaker-diarization et https://huggingface.co/pyannote/segmentation. Obtenez votre JETON CLÉ ici : https://hf.co/settings/tokens", + "ht_token_ph": "Le jeton va ici...", + "tab_docs": "Traduction de documents", + "docs_input_label": "Choisir la source du document", + "docs_input_info": "Il peut s'agir de PDF, DOCX, TXT ou texte", + "docs_source_info": "Il s'agit de la langue d'origine du texte", + "chunk_size_label": "Nombre maximal de caractères que le TTS traitera par segment", + "chunk_size_info": "Une valeur de 0 attribue une valeur dynamique et plus compatible pour le TTS.", + "docs_button": "Démarrer le pont de conversion de langue", + "cv_url_info": "Téléchargez automatiquement les modèles R.V.C. depuis l'URL. Vous pouvez utiliser des liens depuis HuggingFace ou Drive, et vous pouvez inclure plusieurs liens, chacun séparé par une virgule. 
Exemple : https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Remplacer la voix : TTS par R.V.C.", + "sec1_title": "### 1. Pour activer son utilisation, marquez-la comme activée.", + "enable_replace": "Cochez pour activer l'utilisation des modèles.", + "sec2_title": "### 2. Sélectionnez une voix qui sera appliquée à chaque TTS de chaque locuteur correspondant et appliquez les configurations.", + "sec2_subtitle": "En fonction du nombre de que vous utiliserez, chacun doit avoir son modèle respectif. De plus, il y a un auxiliaire si pour une raison quelconque le locuteur n'est pas détecté correctement.", + "cv_tts1": "Choisissez la voix à appliquer pour le Locuteur 1.", + "cv_tts2": "Choisissez la voix à appliquer pour le Locuteur 2.", + "cv_tts3": "Choisissez la voix à appliquer pour le Locuteur 3.", + "cv_tts4": "Choisissez la voix à appliquer pour le Locuteur 4.", + "cv_tts5": "Choisissez la voix à appliquer pour le Locuteur 5.", + "cv_tts6": "Choisissez la voix à appliquer pour le Locuteur 6.", + "cv_tts7": "Choisissez la voix à appliquer pour le Locuteur 7.", + "cv_tts8": "Choisissez la voix à appliquer pour le Locuteur 8.", + "cv_tts9": "Choisissez la voix à appliquer pour le Locuteur 9.", + "cv_tts10": "Choisissez la voix à appliquer pour le Locuteur 10.", + "cv_tts11": "Choisissez la voix à appliquer pour le Locuteur 11.", + "cv_tts12": "Choisissez la voix à appliquer pour le Locuteur 12.", + "cv_aux": "- Voix à appliquer en cas de détection incorrecte d'un locuteur.", + "cv_button_apply": "APPLIQUER LA CONFIGURATION", + "tab_help": "Aide", + }, + "german": { + "description": """ + ### 🎥 **Übersetzen Sie Videos einfach mit SoniTranslate!** 📽️ + + Laden Sie ein Video, eine Audiodatei hoch oder geben Sie einen YouTube-Link an. 📽️ **Holen Sie sich das aktualisierte Notizbuch aus dem offiziellen Repository: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Sehen Sie sich den Tab `Hilfe` für Anweisungen zur Verwendung an. Fangen wir an, Spaß beim Übersetzen von Videos zu haben! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Anleitung zur Verwendung:** + + 1. 📤 Laden Sie ein **Video**, eine **Audiodatei** hoch oder geben Sie einen 🌐 **YouTube-Link** an. + + 2. 🌍 Wählen Sie die Sprache aus, in die Sie das **Video übersetzen möchten**. + + 3. 🗣️ Geben Sie die **Anzahl der Sprecher im Video** an und **weisen Sie jedem einen Text-to-Speech-Stimme** zu, die für die Übersetzungssprache geeignet ist. + + 4. 🚀 Drücken Sie die Schaltfläche '**Übersetzen**', um die Ergebnisse zu erhalten. + + --- + + # 🧩 **SoniTranslate unterstützt verschiedene TTS (Text-to-Speech)-Engines, darunter:** + - EDGE-TTS → Format `en-AU-WilliamNeural-Male` → Schnell und präzise. + - FACEBOOK MMS → Format `en-facebook-mms VITS` → Die Stimme ist natürlicher; derzeit nur CPU. + - PIPER TTS → Format `en_US-lessac-high VITS-onnx` → Wie das vorherige, aber optimiert für CPU und GPU. + - BARK → Format `en_speaker_0-Male BARK` → Gute Qualität, aber langsam und anfällig für Halluzinationen. + - OpenAI TTS → Format `>alloy OpenAI-TTS` → Multisprachig, erfordert jedoch einen OpenAI API key + - Coqui XTTS → Format `_XTTS_/AUTOMATIC.wav` → Nur verfügbar für Chinesisch (vereinfacht), Englisch, Französisch, Deutsch, Italienisch, Portugiesisch, Polnisch, Türkisch, Russisch, Niederländisch, Tschechisch, Arabisch, Spanisch, Ungarisch, Koreanisch und Japanisch. + + --- + + # 🎤 So verwenden Sie R.V.C. 
und R.V.C.2 Stimmen (optional) 🎶 + + Das Ziel ist es, eine R.V.C. auf das generierte TTS (Text-to-Speech) anzuwenden 🎙️ + + 1. Laden Sie in der Registerkarte `Benutzerdefinierte Stimme R.V.C.` die Modelle herunter, die Sie benötigen 📥 Sie können Links von Hugging Face und Google Drive in Formaten wie zip, pth oder Index verwenden. Sie können auch komplette HF-Raum-Repositories herunterladen, aber diese Option ist nicht sehr stabil 😕 + + 2. Gehen Sie nun zu `Stimme ersetzen: TTS zu R.V.C.` und aktivieren Sie das Kontrollkästchen `aktivieren` ✅ Danach können Sie die Modelle auswählen, die Sie auf jeden TTS-Sprecher anwenden möchten 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Passen Sie die F0-Methode an, die auf alle R.V.C. angewendet wird. 🎛️ + + 4. Drücken Sie `KONFIGURATION ANWENDEN`, um die vorgenommenen Änderungen anzuwenden 🔄 + + 5. Gehen Sie zurück zum Tab für die Videoübersetzung und klicken Sie auf 'Übersetzen' ▶️ Jetzt wird die Übersetzung mit der R.V.C. angewendet. 🗣️ + + Tipp: Sie können `Test R.V.C.` verwenden, um zu experimentieren und die besten TTS oder Konfigurationen zu finden, die auf die R.V.C. angewendet werden sollen. 🧪🔍 + + --- + + """, + "tab_translate": "Videotranslation", + "video_source": "Wählen Sie die Videoquelle", + "link_label": "Medienlink.", + "link_info": "Beispiel: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL hier eingeben...", + "dir_label": "Videopfad.", + "dir_info": "Beispiel: /usr/home/my_video.mp4", + "dir_ph": "Pfad hier eingeben...", + "sl_label": "Ausgangssprache", + "sl_info": "Dies ist die Originalsprache des Videos", + "tat_label": "Audio übersetzen nach", + "tat_info": "Wählen Sie die Zielsprache aus und stellen Sie sicher, dass Sie die entsprechende TTS für diese Sprache auswählen.", + "num_speakers": "Wählen Sie, wie viele Personen im Video sprechen.", + "min_sk": "Min Sprecher", + "max_sk": "Max Sprecher", + "tts_select": "Wählen Sie die Stimme für jeden Sprecher aus.", + "sk1": "TTS-Sprecher 1", + "sk2": "TTS-Sprecher 2", + "sk3": "TTS-Sprecher 3", + "sk4": "TTS-Sprecher 4", + "sk5": "TTS-Sprecher 5", + "sk6": "TTS-Sprecher 6", + "sk7": "TTS-Sprecher 7", + "sk8": "TTS-Sprecher 8", + "sk9": "TTS-Sprecher 9", + "sk10": "TTS-Sprecher 10", + "sk11": "TTS-Sprecher 11", + "sk12": "TTS-Sprecher 12", + "vc_title": "Stimmenimitation in verschiedenen Sprachen", + "vc_subtitle": """ + ### Reproduzieren Sie die Stimme einer Person in verschiedenen Sprachen. + Obwohl es bei den meisten Stimmen wirksam ist, kann es nicht in jedem Fall perfekt sein. + Die Stimmenimitation repliziert ausschließlich den Ton des Referenzsprechers und schließt Akzent und Emotion aus, die durch das TTS-Modell des Basis-Sprechers gesteuert werden und nicht vom Konverter repliziert werden. + Es werden Audioaufnahmen aus dem Hauptaudio für jeden Sprecher entnommen und verarbeitet. 
+ """, + "vc_active_label": "Aktive Stimmenimitation", + "vc_active_info": "Aktive Stimmenimitation: Reproduziert den Ton des Originalsprechers", + "vc_method_label": "Methode", + "vc_method_info": "Wählen Sie eine Methode für den Stimmenimitationsprozess aus", + "vc_segments_label": "Max Proben", + "vc_segments_info": "Max Proben: Ist die Anzahl der Audioaufnahmen, die für den Prozess generiert werden, mehr ist besser, aber es kann Lärm hinzufügen", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Wendet vokalen Dereverb auf die Audioaufnahmen an.", + "vc_remove_label": "Vorherige Proben entfernen", + "vc_remove_info": "Vorherige Proben entfernen: Entfernt die zuvor generierten Proben, sodass neue erstellt werden müssen.", + "xtts_title": "Erstellen Sie ein TTS basierend auf einem Audio", + "xtts_subtitle": "Laden Sie eine Audiodatei von maximal 10 Sekunden mit einer Stimme hoch. Mit XTTS wird ein neues TTS mit einer Stimme ähnlich der bereitgestellten Audiodatei erstellt.", + "xtts_file_label": "Laden Sie eine kurze Audio mit der Stimme hoch", + "xtts_name_label": "Name für das TTS", + "xtts_name_info": "Verwenden Sie einen einfachen Namen", + "xtts_dereverb_label": "Dereverb-Audio", + "xtts_dereverb_info": "Dereverb-Audio: Wendet vokalen Dereverb auf die Audioaufnahme an", + "xtts_button": "Verarbeiten Sie das Audio und fügen Sie es dem TTS-Auswähler hinzu", + "xtts_footer": "Generieren Sie Stimme xtts automatisch: Sie können `_XTTS_/AUTOMATIC.wav` im TTS-Auswähler verwenden, um automatisch Segmente für jeden Sprecher zu generieren, wenn die Übersetzung generiert wird.", + "extra_setting": "Erweiterte Einstellungen", + "acc_max_label": "Max Audiobeschleunigung", + "acc_max_info": "Maximale Beschleunigung für übersetzte Audiosegmente, um Überlappungen zu vermeiden. 
Ein Wert von 1,0 repräsentiert keine Beschleunigung", + "acc_rate_label": "Beschleunigungsrate-Regelung", + "acc_rate_info": "Beschleunigungsrate-Regelung: Passt die Beschleunigung an, um Segmente mit weniger Geschwindigkeit anzupassen, um die Kontinuität zu erhalten und den Zeitpunkt des nächsten Starts zu berücksichtigen.", + "or_label": "Überlappungsreduzierung", + "or_info": "Überlappungsreduzierung: Stellt sicher, dass Segmente sich nicht überschneiden, indem Startzeiten auf Grundlage vorheriger Endzeiten angepasst werden; könnte die Synchronisierung stören.", + "aud_mix_label": "Audio-Mixing-Methode", + "aud_mix_info": "Mischen Sie Original- und übersetzte Audiodateien, um eine individuelle, ausgewogene Ausgabe mit zwei verfügbaren Mischmodi zu erstellen.", + "vol_ori": "Lautstärke des Originaltons", + "vol_tra": "Lautstärke des übersetzten Tons", + "voiceless_tk_label": "Stimmenloses Track", + "voiceless_tk_info": "Stimmenloses Track: Entfernen Sie die Original-Audio-Stimmen, bevor Sie sie mit dem übersetzten Audio kombinieren.", + "sub_type": "Untertiteltyp", + "soft_subs_label": "Weiche Untertitel", + "soft_subs_info": "Weiche Untertitel: Optionale Untertitel, die Zuschauer während des Videostreamings ein- oder ausschalten können.", + "burn_subs_label": "Untertitel einbetten", + "burn_subs_info": "Untertitel einbetten: Untertitel in das Video einbetten und somit zu einem festen Bestandteil des visuellen Inhalts machen.", + "whisper_title": "Konfiguration Transkription.", + "lnum_label": "Zahlen Literalisieren", + "lnum_info": "Zahlen Literalisieren: Ersetzen numerischer Darstellungen durch ihre geschriebenen Äquivalente in der Transkription.", + "scle_label": "Tonbereinigung", + "scle_info": "Tonbereinigung: Verbesserung der Stimme, Entfernen von Hintergrundgeräuschen vor der Transkription für maximale Zeitstempelgenauigkeit. Diese Operation kann Zeit in Anspruch nehmen, insbesondere bei längeren Audiodateien.", + "sd_limit_label": "Segmentdauerbegrenzung", + "sd_limit_info": "Geben Sie die maximale Dauer (in Sekunden) für jeden Abschnitt an. Der Ton wird unter Verwendung von VAD verarbeitet, wobei die Dauer für jeden Segmentabschnitt begrenzt wird.", + "asr_model_info": "Es wandelt gesprochene Sprache standardmäßig mit dem 'Whisper'-Modell in Text um. Verwenden Sie ein benutzerdefiniertes Modell, indem Sie beispielsweise den Repository-Namen 'BELLE-2/Belle-whisper-large-v3-zh' im Dropdown-Menü eingeben, um ein chinesisches Sprachmodell zu verwenden. Finden Sie feinabgestimmte Modelle auf Hugging Face.", + "ctype_label": "Berechnungstyp", + "ctype_info": "Die Auswahl kleinerer Typen wie int8 oder float16 kann die Leistung verbessern, indem der Speicherverbrauch reduziert und die Rechenleistung erhöht wird, kann jedoch im Vergleich zu größeren Datentypen wie float32 an Präzision verlieren.", + "batchz_label": "Batch-Größe", + "batchz_info": "Die Reduzierung der Batch-Größe spart Speicherplatz, wenn Ihre GPU weniger VRAM hat, und hilft bei der Verwaltung von Out-of-Memory-Problemen.", + "tsscale_label": "Textsegmentierungsskala", + "tsscale_info": "Teilen Sie den Text in Segmente nach Sätzen, Wörtern oder Zeichen auf. Die Segmentierung nach Wörtern und Zeichen bietet eine feinere Granularität, die für Untertitel nützlich ist. 
Das Deaktivieren der Übersetzung erhält die Originalstruktur.", + "srt_file_label": "Laden Sie eine SRT-Untertiteldatei hoch (wird anstelle der Transkription von Whisper verwendet)", + "divide_text_label": "Textsegmente neu aufteilen nach:", + "divide_text_info": "(Experimentell) Geben Sie einen Separator ein, um vorhandene Textsegmente in der Ausgangssprache aufzuteilen. Das Tool erkennt Vorkommen und erstellt entsprechend neue Segmente. Geben Sie mehrere Trennzeichen mit | an, z. B.: !|?|...|。", + "diarization_label": "Diarisierungsmodell", + "tr_process_label": "Übersetzungsprozess", + "out_type_label": "Ausgabetyp", + "out_name_label": "Dateiname", + "out_name_info": "Der Name der Ausgabedatei", + "task_sound_label": "Aufgabenstatus-Sound", + "task_sound_info": "Aufgabenstatus-Sound: Gibt einen akustischen Hinweis auf den Abschluss der Aufgabe oder Fehler während der Ausführung.", + "cache_label": "Fortschritt abrufen", + "cache_info": "Fortschritt abrufen: Fortfahren vom letzten Kontrollpunkt.", + "preview_info": "Die Vorschau schneidet das Video zu Testzwecken auf nur 10 Sekunden. Deaktivieren Sie es bitte, um die volle Videodauer abzurufen.", + "edit_sub_label": "Generierte Untertitel bearbeiten", + "edit_sub_info": "Generierte Untertitel bearbeiten: Ermöglicht Ihnen, die Übersetzung in 2 Schritten durchzuführen. Zuerst mit der Schaltfläche 'UNTERTITEL BEKOMMEN UND BEARBEITEN' erhalten Sie die Untertitel, um sie zu bearbeiten, und dann mit der Schaltfläche 'ÜBERSETZEN' können Sie das Video generieren", + "button_subs": "UNTERTITEL BEKOMMEN UND BEARBEITEN", + "editor_sub_label": "Generierte Untertitel", + "editor_sub_info": "Bearbeiten Sie den Text in den generierten Untertiteln hier. Sie können Änderungen an den Schnittstellenoptionen vornehmen, bevor Sie auf die Schaltfläche 'ÜBERSETZEN' klicken, außer 'Ausgangssprache', 'Audio übersetzen nach' und 'Max Sprecher', um Fehler zu vermeiden. Wenn Sie fertig sind, klicken Sie auf die Schaltfläche 'ÜBERSETZEN'.", + "editor_sub_ph": "Drücken Sie zuerst 'UNTERTITEL BEKOMMEN UND BEARBEITEN', um die Untertitel zu erhalten", + "button_translate": "ÜBERSETZEN", + "output_result_label": "ÜBERSETZTES VIDEO HERUNTERLADEN", + "sub_ori": "Untertitel", + "sub_tra": "Übersetzte Untertitel", + "ht_token_info": "Ein wichtiger Schritt besteht darin, die Lizenzvereinbarung für die Verwendung von Pyannote zu akzeptieren. Sie müssen ein Konto bei Hugging Face haben und die Lizenz akzeptieren, um die Modelle zu verwenden: https://huggingface.co/pyannote/speaker-diarization und https://huggingface.co/pyannote/segmentation. Holen Sie sich hier Ihren SCHLÜSSELTOKEN: https://hf.co/settings/tokens", + "ht_token_ph": "Token hier eingeben...", + "tab_docs": "Dokumentübersetzung", + "docs_input_label": "Dokumentquelle auswählen", + "docs_input_info": "Es kann PDF, DOCX, TXT oder Text sein", + "docs_source_info": "Dies ist die Originalsprache des Textes", + "chunk_size_label": "Maximale Anzahl von Zeichen, die der TTS pro Segment verarbeiten soll", + "chunk_size_info": "Ein Wert von 0 weist einen dynamischen und kompatibleren Wert für den TTS zu.", + "docs_button": "Starten Sie die Sprachkonvertierung Bridge", + "cv_url_info": "Laden Sie die R.V.C.-Modelle automatisch von der URL herunter. Sie können Links von HuggingFace oder Drive verwenden und mehrere Links, jeweils durch ein Komma getrennt, einfügen. 
Beispiel: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Stimme ersetzen: TTS zu R.V.C.", + "sec1_title": "### 1. Um seine Verwendung zu aktivieren, markieren Sie es als aktiv.", + "enable_replace": "Aktivieren Sie dies, um die Verwendung der Modelle zu ermöglichen.", + "sec2_title": "### 2. Wählen Sie eine Stimme aus, die auf jeden TTS jedes entsprechenden Sprechers angewendet wird, und wenden Sie die Konfigurationen an.", + "sec2_subtitle": "Je nachdem, wie viele Sie verwenden werden, benötigt jeder sein entsprechendes Modell. Außerdem gibt es ein Hilfsmodell, falls der Sprecher aus irgendeinem Grund nicht korrekt erkannt wird.", + "cv_tts1": "Wählen Sie die Stimme für Sprecher 1 aus.", + "cv_tts2": "Wählen Sie die Stimme für Sprecher 2 aus.", + "cv_tts3": "Wählen Sie die Stimme für Sprecher 3 aus.", + "cv_tts4": "Wählen Sie die Stimme für Sprecher 4 aus.", + "cv_tts5": "Wählen Sie die Stimme für Sprecher 5 aus.", + "cv_tts6": "Wählen Sie die Stimme für Sprecher 6 aus.", + "cv_tts7": "Wählen Sie die Stimme für Sprecher 7 aus.", + "cv_tts8": "Wählen Sie die Stimme für Sprecher 8 aus.", + "cv_tts9": "Wählen Sie die Stimme für Sprecher 9 aus.", + "cv_tts10": "Wählen Sie die Stimme für Sprecher 10 aus.", + "cv_tts11": "Wählen Sie die Stimme für Sprecher 11 aus.", + "cv_tts12": "Wählen Sie die Stimme für Sprecher 12 aus.", + "cv_aux": "- Stimme, die angewendet wird, falls ein Sprecher nicht erfolgreich erkannt wird.", + "cv_button_apply": "KONFIGURATION ANWENDEN", + "tab_help": "Hilfe", + }, + "italian": { + "description": """ + ### 🎥 **Traduci i video facilmente con SoniTranslate!** 📽️ + + Carica un video, un file audio o fornisci un link YouTube. 📽️ **Ottieni il notebook aggiornato dal repository ufficiale: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Consulta la scheda `Aiuto` per istruzioni su come utilizzarlo. Iniziamo a divertirci con la traduzione dei video! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Istruzioni per l'uso:** + + 1. 📤 Carica un **video**, un **file audio** o fornisci un 🌐 **link YouTube**. + + 2. 🌍 Scegli la lingua in cui desideri **tradurre il video**. + + 3. 🗣️ Specifica il **numero di persone che parlano** nel video e **assegna a ciascuna una voce di sintesi vocale** adatta alla lingua di traduzione. + + 4. 🚀 Premi il pulsante '**Traduci**' per ottenere i risultati. + + --- + + # 🧩 **SoniTranslate supporta diversi motori TTS (Text-to-Speech), tra cui:** + - EDGE-TTS → formato `en-AU-WilliamNeural-Male` → Veloce e preciso. + - FACEBOOK MMS → formato `en-facebook-mms VITS` → La voce è più naturale; al momento utilizza solo la CPU. + - PIPER TTS → formato `en_US-lessac-high VITS-onnx` → Come il precedente, ma ottimizzato sia per CPU che GPU. + - BARK → formato `en_speaker_0-Male BARK` → Buona qualità ma lenta e soggetta ad allucinazioni. + - OpenAI TTS → formato `>alloy OpenAI-TTS` → Multilingue ma richiede una OpenAI API key. + - Coqui XTTS → formato `_XTTS_/AUTOMATIC.wav` → Disponibile solo per cinese (semplificato), inglese, francese, tedesco, italiano, portoghese, polacco, turco, russo, olandese, ceco, arabo, spagnolo, ungherese, coreano e giapponese. + + --- + + # 🎤 Come utilizzare le voci R.V.C. e R.V.C.2 (Opzionale) 🎶 + + L'obiettivo è applicare un R.V.C. al TTS (Text-to-Speech) generato 🎙️ + + 1. 
Nella scheda `Custom Voice R.V.C.`, scarica i modelli di cui hai bisogno 📥 Puoi utilizzare link da Hugging Face e Google Drive in formati come zip, pth o indice. Puoi anche scaricare repository completi di spazio HF, ma questa opzione non è molto stabile 😕 + + 2. Ora, vai su `Sostituisci voce: TTS a R.V.C.` e spunta la casella `abilita` ✅ Dopo questo, puoi scegliere i modelli che desideri applicare a ciascun altoparlante TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Regola il metodo F0 che verrà applicato a tutti i R.V.C. 🎛️ + + 4. Premi `APPLICA CONFIGURAZIONE` per applicare le modifiche apportate 🔄 + + 5. Torna alla scheda di traduzione video e clicca su 'Traduci' ▶️ Ora, la traduzione verrà effettuata applicando il R.V.C. 🗣️ + + Suggerimento: Puoi utilizzare `Test R.V.C.` per sperimentare e trovare il miglior TTS o configurazioni da applicare al R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Traduzione video", + "video_source": "Scegli la fonte video", + "link_label": "Link multimediale.", + "link_info": "Esempio: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "Inserisci l'URL qui...", + "dir_label": "Percorso video.", + "dir_info": "Esempio: /usr/home/mio_video.mp4", + "dir_ph": "Inserisci il percorso qui...", + "sl_label": "Lingua di origine", + "sl_info": "Questa è la lingua originale del video", + "tat_label": "Traduci l'audio in", + "tat_info": "Seleziona la lingua di destinazione e assicurati anche di scegliere il TTS corrispondente per quella lingua.", + "num_speakers": "Seleziona quanti parlano nel video.", + "min_sk": "Numero minimo di altoparlanti", + "max_sk": "Numero massimo di altoparlanti", + "tts_select": "Seleziona la voce desiderata per ogni altoparlante.", + "sk1": "Altoparlante TTS 1", + "sk2": "Altoparlante TTS 2", + "sk3": "Altoparlante TTS 3", + "sk4": "Altoparlante TTS 4", + "sk5": "Altoparlante TTS 5", + "sk6": "Altoparlante TTS 6", + "sk7": "Altoparlante TTS 7", + "sk8": "Altoparlante TTS 8", + "sk9": "Altoparlante TTS 9", + "sk10": "Altoparlante TTS 10", + "sk11": "Altoparlante TTS 11", + "sk12": "Altoparlante TTS 12", + "vc_title": "Imitazione della voce in diverse lingue", + "vc_subtitle": """ + ### Replica la voce di una persona in varie lingue. + Sebbene efficace con la maggior parte delle voci quando usato correttamente, potrebbe non raggiungere la perfezione in ogni caso. + L'imitazione della voce replica esclusivamente il tono del locutore di riferimento, escludendo accento ed emozione, che sono governati dal modello TTS del locutore di base e non replicati dal convertitore. + Questo prenderà campioni audio dall'audio principale per ciascun altoparlante e li elaborerà. + """, + "vc_active_label": "Imitazione attiva della voce", + "vc_active_info": "Imitazione attiva della voce: Replica il tono del locutore originale", + "vc_method_label": "Metodo", + "vc_method_info": "Seleziona un metodo per il processo di imitazione della voce", + "vc_segments_label": "Campioni massimi", + "vc_segments_info": "Campioni massimi: è il numero di campioni audio che verranno generati per il processo, più è meglio ma può aggiungere rumore", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Applica dereverb vocale ai campioni audio.", + "vc_remove_label": "Rimuovi campioni precedenti", + "vc_remove_info": "Rimuovi campioni precedenti: Rimuove i campioni precedenti generati, quindi è necessario crearne di nuovi.", + "xtts_title": "Crea un TTS basato su un audio", + "xtts_subtitle": "Carica un file audio di massimo 10 secondi con una voce. 
Utilizzando XTTS, verrà creato un nuovo TTS con una voce simile al file audio fornito.", + "xtts_file_label": "Carica un breve audio con la voce", + "xtts_name_label": "Nome per il TTS", + "xtts_name_info": "Utilizza un nome semplice", + "xtts_dereverb_label": "Dereverb audio", + "xtts_dereverb_info": "Dereverb audio: Applica dereverb vocale all'audio", + "xtts_button": "Elabora l'audio e includilo nel selettore TTS", + "xtts_footer": "Genera automaticamente XTTS vocale: Puoi usare `_XTTS_/AUTOMATIC.wav` nel selettore TTS per generare automaticamente segmenti per ciascun altoparlante durante la generazione della traduzione.", + "extra_setting": "Impostazioni avanzate", + "acc_max_label": "Massima accelerazione audio", + "acc_max_info": "Massima accelerazione per i segmenti audio tradotti per evitare sovrapposizioni. Un valore di 1,0 rappresenta nessuna accelerazione", + "acc_rate_label": "Regolazione del tasso di accelerazione", + "acc_rate_info": "Regolazione del tasso di accelerazione: Regola l'accelerazione per adattarsi ai segmenti che richiedono una velocità inferiore, mantenendo la continuità e considerando il timing di avvio successivo.", + "or_label": "Riduzione Sovrapposizione", + "or_info": "Riduzione Sovrapposizione: Assicura che i segmenti non si sovrappongano regolando gli orari di inizio in base agli orari di fine precedenti; potrebbe interrompere la sincronizzazione.", + "aud_mix_label": "Metodo di mixing audio", + "aud_mix_info": "Mixa file audio originali e tradotti per creare un output personalizzato e bilanciato con due modalità di mixing disponibili.", + "vol_ori": "Volume audio originale", + "vol_tra": "Volume audio tradotto", + "voiceless_tk_label": "Traccia senza voce", + "voiceless_tk_info": "Traccia senza voce: Rimuove le voci audio originali prima di combinarle con l'audio tradotto.", + "sub_type": "Tipo di sottotitolo", + "soft_subs_label": "Sottotitoli Soft", + "soft_subs_info": "Sottotitoli Soft: Sottotitoli opzionali che gli spettatori possono attivare o disattivare durante la visione del video.", + "burn_subs_label": "Incorpora sottotitoli", + "burn_subs_info": "Incorpora sottotitoli: Incorpora i sottotitoli nel video, rendendoli una parte permanente del contenuto visivo.", + "whisper_title": "Configura la trascrizione.", + "lnum_label": "Literalizzare Numeri", + "lnum_info": "Literalizzare Numeri: Sostituisci le rappresentazioni numeriche con i loro equivalenti scritti nella trascrizione.", + "scle_label": "Pulizia del Suono", + "scle_info": "Pulizia del Suono: Migliora le voci, rimuovi il rumore di fondo prima della trascrizione per una massima precisione dei timestamp. Questa operazione può richiedere del tempo, specialmente con file audio lunghi.", + "sd_limit_label": "Limite Durata Segmento", + "sd_limit_info": "Specifica la durata massima (in secondi) per ogni segmento. L'audio verrà elaborato utilizzando VAD, limitando la durata per ciascun frammento di segmento.", + "asr_model_info": "Converte il linguaggio parlato in testo utilizzando il modello 'Whisper' per impostazione predefinita. Utilizza un modello personalizzato, ad esempio, inserendo il nome del repository 'BELLE-2/Belle-whisper-large-v3-zh' nel menu a discesa per utilizzare un modello pre-ottimizzato in cinese. 
Trova modelli pre-ottimizzati su Hugging Face.", + "ctype_label": "Tipo di Calcolo", + "ctype_info": "Scegliere tipi più piccoli come int8 o float16 può migliorare le prestazioni riducendo l'utilizzo della memoria e aumentando il throughput computazionale, ma può sacrificare la precisione rispetto a tipi di dati più grandi come float32.", + "batchz_label": "Dimensione Batch", + "batchz_info": "Ridurre la dimensione del batch consente di risparmiare memoria se la tua GPU ha meno VRAM e aiuta a gestire i problemi di memoria esaurita.", + "tsscale_label": "Scala di Segmentazione del Testo", + "tsscale_info": "Dividi il testo in segmenti per frasi, parole o caratteri. La segmentazione per parole e caratteri offre una granularità più fine, utile per i sottotitoli; disabilitare la traduzione conserva la struttura originale.", + "srt_file_label": "Carica un file sottotitoli SRT (verrà utilizzato al posto della trascrizione di Whisper)", + "divide_text_label": "Ridividi i segmenti di testo per:", + "divide_text_info": "(Sperimentale) Inserisci un separatore per dividere i segmenti di testo esistenti nella lingua di origine. Lo strumento identificherà le occorrenze e creerà nuovi segmenti di conseguenza. Specifica più separatori usando |, ad esempio: !|?|...|。", + "diarization_label": "Modello di diarizzazione", + "tr_process_label": "Processo di traduzione", + "out_type_label": "Tipo di output", + "out_name_label": "Nome del file", + "out_name_info": "Il nome del file di output", + "task_sound_label": "Suono dello stato del compito", + "task_sound_info": "Suono dello stato del compito: Riproduce un segnale acustico che indica il completamento del compito o gli errori durante l'esecuzione.", + "cache_label": "Recupero Progresso", + "cache_info": "Recupero Progresso: Continua il processo dall'ultimo checkpoint.", + "preview_info": "La preview taglia il video a soli 10 secondi per scopi di test. Disattivala per ripristinare la durata completa del video.", + "edit_sub_label": "Modifica i sottotitoli generati", + "edit_sub_info": "Modifica i sottotitoli generati: Ti consente di eseguire la traduzione in 2 passaggi. Prima con il pulsante 'OTTIENI SOTTOTITOLI E MODIFICA', ottieni i sottotitoli per modificarli, e poi con il pulsante 'TRADUCI', puoi generare il video", + "button_subs": "OTTIENI SOTTOTITOLI E MODIFICA", + "editor_sub_label": "Sottotitoli generati", + "editor_sub_info": "Modifica il testo nei sottotitoli generati qui. Puoi apportare modifiche alle opzioni dell'interfaccia prima di fare clic sul pulsante 'TRADUCI', ad eccezione di 'Lingua di origine', 'Traduci l'audio in' e 'Numero massimo di altoparlanti', per evitare errori. Una volta finito, fai clic sul pulsante 'TRADUCI'.", + "editor_sub_ph": "Prima premi 'OTTIENI SOTTOTITOLI E MODIFICA' per ottenere i sottotitoli", + "button_translate": "TRADUCI", + "output_result_label": "SCARICA VIDEO TRADOTTO", + "sub_ori": "Sottotitoli", + "sub_tra": "Sottotitoli tradotti", + "ht_token_info": "Un passaggio importante è accettare l'accordo di licenza per l'uso di Pyannote. È necessario avere un account su Hugging Face e accettare la licenza per utilizzare i modelli: https://huggingface.co/pyannote/speaker-diarization e https://huggingface.co/pyannote/segmentation. 
Ottieni il tuo TOKEN CHIAVE qui: https://hf.co/settings/tokens", + "ht_token_ph": "Inserisci il token qui...", + "tab_docs": "Traduzione documenti", + "docs_input_label": "Scegli la fonte del documento", + "docs_input_info": "Può essere PDF, DOCX, TXT o testo", + "docs_source_info": "Questa è la lingua originale del testo", + "chunk_size_label": "Numero massimo di caratteri che il TTS elaborerà per segmento", + "chunk_size_info": "Un valore di 0 assegna un valore dinamico e più compatibile per il TTS.", + "docs_button": "Avvia ponte di conversione linguistica", + "cv_url_info": "Scarica automaticamente i modelli R.V.C. dall'URL. Puoi utilizzare link da HuggingFace o Drive e puoi includere diversi link, ognuno separato da una virgola. Esempio: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Sostituisci voce: TTS a R.V.C.", + "sec1_title": "### 1. Per abilitarne l'uso, contrassegnalo come abilitato.", + "enable_replace": "Seleziona questa opzione per abilitare l'uso dei modelli.", + "sec2_title": "### 2. Seleziona una voce che verrà applicata a ciascun TTS di ciascun altoparlante corrispondente e applica le configurazioni.", + "sec2_subtitle": "A seconda di quanti utilizzerai, ognuno avrà bisogno del proprio modello. Inoltre, c'è un modello ausiliario nel caso in cui il parlante non venga rilevato correttamente.", + "cv_tts1": "Scegli la voce da applicare per l'Altoparlante 1.", + "cv_tts2": "Scegli la voce da applicare per l'Altoparlante 2.", + "cv_tts3": "Scegli la voce da applicare per l'Altoparlante 3.", + "cv_tts4": "Scegli la voce da applicare per l'Altoparlante 4.", + "cv_tts5": "Scegli la voce da applicare per l'Altoparlante 5.", + "cv_tts6": "Scegli la voce da applicare per l'Altoparlante 6.", + "cv_tts7": "Scegli la voce da applicare per l'Altoparlante 7.", + "cv_tts8": "Scegli la voce da applicare per l'Altoparlante 8.", + "cv_tts9": "Scegli la voce da applicare per l'Altoparlante 9.", + "cv_tts10": "Scegli la voce da applicare per l'Altoparlante 10.", + "cv_tts11": "Scegli la voce da applicare per l'Altoparlante 11.", + "cv_tts12": "Scegli la voce da applicare per l'Altoparlante 12.", + "cv_aux": "- Voce da applicare nel caso in cui un altoparlante non venga rilevato correttamente.", + "cv_button_apply": "APPLICA CONFIGURAZIONE", + "tab_help": "Aiuto", + }, + "japanese": { + "description": """ + ### 🎥 **SoniTranslateで簡単に動画を翻訳しましょう!** 📽️ + + 動画、音声ファイルをアップロードするか、YouTubeのリンクを提供してください。📽️ **公式リポジトリから最新のノートブックを入手する: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + 使用方法についての指示は`ヘルプ`タブを参照してください。動画翻訳を楽しんでみましょう! 🚀🎉 + """, + "tutorial": """ + # 🔰 **使用方法:** + + 1. 📤 **動画**、**音声ファイル**をアップロードするか、🌐 **YouTubeのリンク**を提供します。 + + 2. 🌍 **動画を翻訳する言語**を選択します。 + + 3. 🗣️ **動画内の話者の数**を指定し、それぞれの話者に翻訳言語に適したテキスト読み上げ音声を割り当てます。 + + 4. 
🚀 '**翻訳**'ボタンを押して結果を取得します。 + + --- + + # 🧩 **SoniTranslateはさまざまなTTS(テキスト読み上げ)エンジンをサポートしています。これらは次のとおりです:** + - EDGE-TTS → 形式 `en-AU-WilliamNeural-Male` → 速く正確です。 + - FACEBOOK MMS → 形式 `en-facebook-mms VITS` → 音声がより自然です。現時点ではCPUのみを使用します。 + - PIPER TTS → 形式 `en_US-lessac-high VITS-onnx` → 前述のものと同じですが、CPUとGPUの両方に最適化されています。 + - BARK → 形式 `en_speaker_0-Male BARK` → 品質は良好ですが、遅く、幻覚に陥りやすいです。 + - OpenAI TTS → フォーマット `>alloy OpenAI-TTS` → 多言語対応ですが、OpenAIのAPIキーが必要です + - Coqui XTTS → 形式 `_XTTS_/AUTOMATIC.wav` → 中国語(簡体字)、英語、フランス語、ドイツ語、イタリア語、ポルトガル語、ポーランド語、トルコ語、ロシア語、オランダ語、チェコ語、アラビア語、スペイン語、ハンガリー語、韓国語、日本語のみ利用可能です。 + + --- + + # 🎤 R.V.C.とR.V.C.2ボイスの使用方法(オプション) 🎶 + + 目標は、生成されたTTS(テキスト読み上げ)にR.V.C.を適用することです 🎙️ + + 1. `カスタムボイスR.V.C.`タブで、必要なモデルをダウンロードしてください 📥 Hugging FaceやGoogle Driveからのリンクを使用できます。zip、pth、またはindexなどの形式を使用できます。完全なHFスペースリポジトリをダウンロードすることもできますが、このオプションはあまり安定していません 😕 + + 2. 今度は、`TTSからR.V.C.への置換`に移動し、`有効`ボックスをチェックします ✅ これ以降、各TTSスピーカーに適用するモデルを選択できます 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. すべてのR.V.C.に適用されるF0メソッドを調整します 🎛️ + + 4. 変更した設定を適用するには、`設定を適用`を押します 🔄 + + 5. 動画翻訳タブに戻り、「翻訳」をクリックします ▶️ これで、R.V.C.を適用して翻訳が行われます 🗣️ + + ヒント: `テストR.V.C.`を使用して、適用する最適なTTSまたは設定を実験し、見つけることができます 🧪🔍 + + --- + + """, + "tab_translate": "動画翻訳", + "video_source": "動画ソースを選択", + "link_label": "メディアリンク。", + "link_info": "例: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URLをここに入力...", + "dir_label": "ビデオパス。", + "dir_info": "例: /usr/home/my_video.mp4", + "dir_ph": "パスをここに入力...", + "sl_label": "元の言語", + "sl_info": "動画の元の言語です", + "tat_label": "翻訳先の言語", + "tat_info": "対象言語を選択し、その言語に対応するTTSを選択することも忘れないでください。", + "num_speakers": "ビデオ内の話者の数を選択してください。", + "min_sk": "最小スピーカー", + "max_sk": "最大スピーカー", + "tts_select": "各スピーカーに適した音声を選択してください。", + "sk1": "TTSスピーカー1", + "sk2": "TTSスピーカー2", + "sk3": "TTSスピーカー3", + "sk4": "TTSスピーカー4", + "sk5": "TTSスピーカー5", + "sk6": "TTSスピーカー6", + "sk7": "TTSスピーカー7", + "sk8": "TTSスピーカー8", + "sk9": "TTSスピーカー9", + "sk10": "TTSスピーカー10", + "sk11": "TTSスピーカー11", + "sk12": "TTSスピーカー12", + "vc_title": "異なる言語での音声模倣", + "vc_subtitle": """ + ### さまざまな言語で人の声を再現します。 + 適切に使用されるとほとんどの声に効果的ですが、すべての場合に完璧な結果が得られるわけではありません。 + 音声模倣は、アクセントや感情を除く参照スピーカーの音色のみを再現し、これらは基本スピーカーTTSモデルによって制御され、変換器によっては再現されません。 + これにより、各話者のメインオーディオからオーディオサンプルを取得し、処理します。 + """, + "vc_active_label": "アクティブ音声模倣", + "vc_active_info": "アクティブ音声模倣:元のスピーカーの音色を再現します", + "vc_method_label": "メソッド", + "vc_method_info": "音声模倣プロセスのメソッドを選択します", + "vc_segments_label": "最大サンプル数", + "vc_segments_info": "最大サンプル数:プロセスに使用されるオーディオサンプルの数。より多いほど良いですが、ノイズが発生する可能性があります", + "vc_dereverb_label": "リバーブを除去", + "vc_dereverb_info": "リバーブを除去:オーディオサンプルにボーカルリバーブを適用します。", + "vc_remove_label": "以前のサンプルを削除", + "vc_remove_info": "以前のサンプルを削除:以前に生成されたサンプルを削除し、新しいサンプルを作成する必要があります。", + "xtts_title": "オーディオを基にTTSを作成する", + "xtts_subtitle": "声が入った最大10秒のオーディオファイルをアップロードします。 XTTSを使用すると、提供されたオーディオファイルに似た声で新しいTTSが作成されます。", + "xtts_file_label": "声の入った短いオーディオをアップロードしてください", + "xtts_name_label": "TTSの名前", + "xtts_name_info": "簡単な名前を使用してください", + "xtts_dereverb_label": "オーディオのリバーブを除去", + "xtts_dereverb_info": "オーディオのリバーブを除去:オーディオにボーカルリバーブを適用します", + "xtts_button": "オーディオを処理してTTSセレクタに含めます", + "xtts_footer": "音声xttsを自動生成する:翻訳を生成する際に各話者のセグメントを自動生成するために、TTSセレクタで`_XTTS_/AUTOMATIC.wav`を使用できます。", + "extra_setting": "高度な設定", + "acc_max_label": "最大オーディオ加速度", + "acc_max_info": "オーバーラップを回避するための翻訳されたオーディオセグメントの最大加速度。値が1.0の場合、加速度はありません", + "acc_rate_label": "加速度調整", + "acc_rate_info": "加速度調整:速度が低いセグメントに適応するために加速度を調整し、連続性を保ち、次の開始時刻を考慮します。", + "or_label": "重複削減", + "or_info": "重複削減:前の終了時間に基づいて開始時間を調整してセグメントが重複しないようにします。同期を妨げる可能性があります。", + 
"aud_mix_label": "オーディオミキシング方法", + "aud_mix_info": "オリジナルと翻訳されたオーディオファイルを混合してカスタマイズされたバランスの取れた出力を作成するための2つの利用可能なミキシングモード。", + "vol_ori": "元のオーディオの音量", + "vol_tra": "翻訳されたオーディオの音量", + "voiceless_tk_label": "声なしトラック", + "voiceless_tk_info": "声なしトラック:翻訳されたオーディオと組み合わせる前に元のオーディオの音声を削除します。", + "sub_type": "字幕タイプ", + "soft_subs_label": "ソフトサブタイトル", + "soft_subs_info": "ソフトサブタイトル:視聴者がビデオを見ながらオンまたはオフにできるオプションの字幕。", + "burn_subs_label": "字幕を焼く", + "burn_subs_info": "字幕を焼く:字幕をビデオに埋め込み、それを視覚コンテンツの恒久的な一部にします。", + "whisper_title": "トランスクリプションの構成。", + "lnum_label": "数値の表現化", + "lnum_info": "数値の表現化:トランスクリプト内の数値表現を書き換えて、数値を文字列に変換します。", + "scle_label": "音声のクリーンアップ", + "scle_info": "音声のクリーンアップ:トランスクリプトの時間スタンプの精度を最大限に高めるために、ボーカルを強調し、背景ノイズを除去します。この操作には時間がかかる場合があります。特に長時間のオーディオファイルの場合。", + "sd_limit_label": "セグメントの長さ制限", + "sd_limit_info": "各セグメントの最大長(秒単位)を指定します。オーディオはVADを使用して処理され、各セグメントチャンクの長さが制限されます。", + "asr_model_info": "デフォルトでは、「Whisperモデル」を使用して、音声をテキストに変換します。カスタムモデルを使用するには、ドロップダウンでリポジトリ名「BELLE-2/Belle-whisper-large-v3-zh」を入力して、中国語の言語を微調整したモデルを利用します。 Hugging Faceで微調整されたモデルを見つけます。", + "ctype_label": "計算タイプ", + "ctype_info": "int8やfloat16などの小さなタイプを選択すると、メモリ使用量が減少し、計算スループットが増加してパフォーマンスが向上しますが、float32などの大きなデータタイプと比較して精度が低下する場合があります。", + "batchz_label": "バッチサイズ", + "batchz_info": "バッチサイズを減らすと、GPUのVRAMが少ない場合にメモリを節約し、メモリ不足の問題を管理するのに役立ちます。", + "tsscale_label": "テキストのセグメンテーションスケール", + "tsscale_info": "テキストを文、単語、または文字でセグメントに分割します。単語と文字のセグメンテーションは、字幕などの細かい粒度の処理に役立ちます。翻訳を無効にすると、元の構造が保持されます。", + "srt_file_label": "SRT字幕ファイルをアップロードしてください(Whisperのトランスクリプションの代わりに使用されます)", + "divide_text_label": "次のようにテキストセグメントを再分割します:", + "divide_text_info": "(実験的) ソース言語の既存のテキストセグメントを分割するセパレーターを入力します。ツールは出現を識別し、適切な箇所で新しいセグメントを作成します。複数のセパレーターを | を使用して指定します。例: !|?|...|。", + "diarization_label": "ダイアライゼーションモデル", + "tr_process_label": "翻訳プロセス", + "out_type_label": "出力タイプ", + "out_name_label": "ファイル名", + "out_name_info": "出力ファイルの名前", + "task_sound_label": "タスクステータスサウンド", + "task_sound_info": "タスクステータスサウンド:タスクの完了または実行中のエラーを示す音声アラートを再生します。", + "cache_label": "進捗を取得", + "cache_info": "進捗を取得:最後のチェックポイントからプロセスを継続します。", + "preview_info": "テスト目的でビデオを10秒に切り取ります。完全なビデオの長さを取得するには、これを無効にしてください。", + "edit_sub_label": "生成された字幕を編集", + "edit_sub_info": "生成された字幕の翻訳を2段階で実行できます。まず、「字幕を取得して編集」ボタンをクリックして字幕を取得して編集し、次に「翻訳」ボタンをクリックしてビデオを生成できます。", + "button_subs": "字幕を取得して編集", + "editor_sub_label": "生成された字幕", + "editor_sub_info": "ここで生成された字幕のテキストを自由に編集してください。エラーを回避するために、インターフェイスのオプションを変更する前に「元の言語」、「翻訳先の言語」、「最大スピーカー」を除く、[翻訳]ボタンをクリックしてください。編集が完了したら、「翻訳」ボタンをクリックします。", + "editor_sub_ph": "まず、「字幕を取得して編集」を押して字幕を取得してください", + "button_translate": "翻訳", + "output_result_label": "翻訳された動画をダウンロード", + "sub_ori": "字幕", + "sub_tra": "翻訳された字幕", + "ht_token_info": "重要なステップの1つは、Pyannoteのライセンス契約を受諾することです。これには、Hugging Faceにアカウントを持ち、モデルの使用許可を受け入れる必要があります: https://huggingface.co/pyannote/speaker-diarization および https://huggingface.co/pyannote/segmentation. 
ここでキー トークンを取得します: https://hf.co/settings/tokens", + "ht_token_ph": "トークンをここに入力...", + "tab_docs": "ドキュメント翻訳", + "docs_input_label": "ドキュメントソースを選択", + "docs_input_info": "PDF、DOCX、TXT、またはテキストである可能性があります", + "docs_source_info": "これはテキストの元の言語です", + "chunk_size_label": "TTSがセグメントごとに処理する最大文字数", + "chunk_size_info": "値が0の場合、TTSに動的で互換性のある値が割り当てられます。", + "docs_button": "言語変換ブリッジを開始", + "cv_url_info": "URLからR.V.C.モデルを自動的にダウンロードします。HuggingFaceまたはドライブからのリンクを使用でき、各リンクをコンマで区切って複数のリンクを含めることができます。例: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "音声を置換: TTSからR.V.C.へ。", + "sec1_title": "### 1. 使用を有効にするには、それを有効にします。", + "enable_replace": "このオプションをチェックして、モデルの使用を有効にします。", + "sec2_title": "### 2. 各対応する話者のTTSに適用される音声を選択し、設定を適用します。", + "sec2_subtitle": "使用するの数に応じて、それぞれに対応するモデルが必要です。また、スピーカーが正しく検出されない場合のために補助的なモデルもあります。", + "cv_tts1": "スピーカー1に適用する音声を選択してください。", + "cv_tts2": "スピーカー2に適用する音声を選択してください。", + "cv_tts3": "スピーカー3に適用する音声を選択してください。", + "cv_tts4": "スピーカー4に適用する音声を選択してください。", + "cv_tts5": "スピーカー5に適用する音声を選択してください。", + "cv_tts6": "スピーカー6に適用する音声を選択してください。", + "cv_tts7": "スピーカー7に適用する音声を選択してください。", + "cv_tts8": "スピーカー8に適用する音声を選択してください。", + "cv_tts9": "スピーカー9に適用する音声を選択してください。", + "cv_tts10": "スピーカー10に適用する音声を選択してください。", + "cv_tts11": "スピーカー11に適用する音声を選択してください。", + "cv_tts12": "スピーカー12に適用する音声を選択してください。", + "cv_aux": "- スピーカーが正常に検出されない場合に適用する音声。", + "cv_button_apply": "設定を適用", + "tab_help": "ヘルプ", + }, + "chinese_zh_cn": { + "description": """ + ### 🎥 **使用SoniTranslate轻松翻译视频!** 📽️ + + 上传视频、音频文件或提供YouTube链接。 📽️ **从官方存储库获取更新的笔记本:[SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + 查看`帮助`标签以获取如何使用的说明。让我们开始享受视频翻译的乐趣吧! 🚀🎉 + """, + "tutorial": """ + # 🔰 **使用说明:** + + 1. 📤 上传**视频**、**音频文件**或提供🌐 **YouTube链接**。 + + 2. 🌍 选择您要**翻译视频**的语言。 + + 3. 🗣️ 指定视频中**发言人数量**并为每个人分配适合翻译语言的文本到语音(TTS)声音。 + + 4. 🚀 按下 '**翻译**' 按钮获取结果。 + + --- + + # 🧩 **SoniTranslate支持不同的TTS(文本到语音)引擎,包括:** + - EDGE-TTS → 格式 `en-AU-WilliamNeural-Male` → 快速而准确。 + - FACEBOOK MMS → 格式 `en-facebook-mms VITS` → 声音更自然;目前仅使用CPU。 + - PIPER TTS → 格式 `en_US-lessac-high VITS-onnx` → 与前一款相同,但针对CPU和GPU进行了优化。 + - BARK → 格式 `en_speaker_0-Male BARK` → 质量良好但速度较慢,易产生幻觉。 + - OpenAI TTS → 格式 `>alloy OpenAI-TTS` → 多语言但需要 OpenAI API key + - Coqui XTTS → 格式 `_XTTS_/AUTOMATIC.wav` → 仅支持简体中文、英文、法文、德文、意大利文、葡萄牙文、波兰文、土耳其文、俄文、荷兰文、捷克文、阿拉伯文、西班牙文、匈牙利文、韩文和日文。 + + --- + + # 🎤 如何使用R.V.C.和R.V.C.2语音(可选)🎶 + + 目标是将R.V.C.应用于生成的TTS(文本到语音)🎙️ + + 1. 在`自定义语音R.V.C.`标签中,下载您需要的模型📥 您可以使用Hugging Face和Google Drive的链接,格式如zip、pth或index。您还可以下载完整的HF空间存储库,但此选项不太稳定😕 + + 2. 现在,转到`替换语音:TTS到R.V.C.`并选中`启用`框✅ 然后,您可以选择要应用于每个TTS发言者的模型👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. 调整将应用于所有R.V.C.的F0方法🎛️ + + 4. 按下`应用配置`以应用所做的更改🔄 + + 5. 
返回视频翻译标签,单击 '翻译' ▶️ 现在,将应用R.V.C.进行翻译。🗣️ + + 提示:您可以使用`测试R.V.C.`来进行实验,找到要应用于R.V.C.的最佳TTS或配置🧪🔍 + + --- + + """, + "tab_translate": "视频翻译", + "video_source": "选择视频来源", + "link_label": "媒体链接。", + "link_info": "示例:www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL放这里...", + "dir_label": "视频路径。", + "dir_info": "示例:/usr/home/my_video.mp4", + "dir_ph": "路径放这里...", + "sl_label": "源语言", + "sl_info": "这是视频的原始语言", + "tat_label": "翻译成", + "tat_info": "选择目标语言,同时确保选择该语言对应的TTS。", + "num_speakers": "选择视频中有多少个人在说话。", + "min_sk": "最少发言者", + "max_sk": "最多发言者", + "tts_select": "为每个发言者选择您想要的语音。", + "sk1": "TTS发言者 1", + "sk2": "TTS发言者 2", + "sk3": "TTS发言者 3", + "sk4": "TTS发言者 4", + "sk5": "TTS发言者 5", + "sk6": "TTS发言者 6", + "sk7": "TTS发言者 7", + "sk8": "TTS发言者 8", + "sk9": "TTS发言者 9", + "sk10": "TTS发言者 10", + "sk11": "TTS发言者 11", + "sk12": "TTS发言者 12", + "vc_title": "不同语言的语音模仿", + "vc_subtitle": """ + ### 在各种语言中复制一个人的声音。 + 当适当使用时,大多数声音都很有效,但并不是每种情况都能达到完美。 + 语音模仿仅复制参考发言者的音调,不包括口音和情感,这些由基础发言者TTS模型控制,并且不会被转换器复制。 + 这将从主音频中获取每个发言者的音频样本并处理它们。 + """, + "vc_active_label": "激活语音模仿", + "vc_active_info": "激活语音模仿:复制原始发言者的音调", + "vc_method_label": "方法", + "vc_method_info": "选择语音模仿过程的方法", + "vc_segments_label": "最大样本数", + "vc_segments_info": "最大样本数:是将用于处理的音频样本数量,越多越好,但可能会添加噪音", + "vc_dereverb_label": "去混响", + "vc_dereverb_info": "去混响:将声音去除混响应用于音频样本。", + "vc_remove_label": "删除先前的样本", + "vc_remove_info": "删除先前的样本:删除先前生成的样本,因此需要创建新样本。", + "xtts_title": "基于音频创建TTS", + "xtts_subtitle": "上传最长10秒的带有声音的音频文件。使用XTTS,将创建一个与提供的音频文件类似的新TTS。", + "xtts_file_label": "上传具有声音的短音频", + "xtts_name_label": "TTS的名称", + "xtts_name_info": "使用简单的名称", + "xtts_dereverb_label": "去混响音频", + "xtts_dereverb_info": "去混响音频:将声音去除混响应用于音频", + "xtts_button": "处理音频并将其包含在TTS选择器中", + "xtts_footer": "自动生成语音xtts:您可以在TTS选择器中使用 `_XTTS_/AUTOMATIC.wav` 来为每个发言者自动生成片段,以用于生成翻译时。", + "extra_setting": "高级设置", + "acc_max_label": "最大音频加速度", + "acc_max_info": "翻译音频段的最大加速度,以避免重叠。值为1.0表示无加速度", + "acc_rate_label": "加速度调节", + "acc_rate_info": "加速度调节:调整加速度以适应需要较低速度的片段,保持连续性并考虑下一个开始的时机。", + "or_label": "重叠减少", + "or_info": "重叠减少:通过根据先前的结束时间调整开始时间来确保片段不重叠;可能会干扰同步。", + "aud_mix_label": "音频混合方法", + "aud_mix_info": "混合原始和翻译音频文件,以创建平衡的定制输出,提供两种可用的混合模式。", + "vol_ori": "原始音频音量", + "vol_tra": "翻译音频音量", + "voiceless_tk_label": "无声音轨", + "voiceless_tk_info": "无声音轨:在将其与翻译音频结合之前删除原始音频声音。", + "sub_type": "字幕类型", + "soft_subs_label": "软字幕", + "soft_subs_info": "软字幕:观众在观看视频时可以选择打开或关闭的可选字幕。", + "burn_subs_label": "烧录字幕", + "burn_subs_info": "烧录字幕:将字幕嵌入视频中,使其成为视觉内容的永久部分。", + "whisper_title": "配置转录。", + "lnum_label": "数字文字化", + "lnum_info": "数字文字化:将数字表示替换为其在转录中的书面等价物。", + "scle_label": "声音清理", + "scle_info": "声音清理:增强语音,消除转录之前的背景噪音,以实现最大的时间戳精度。此操作可能需要一些时间,特别是对于较长的音频文件。", + "sd_limit_label": "段落时长限制", + "sd_limit_info": "指定每个段落的最大持续时间(以秒为单位)。将使用VAD处理音频,以限制每个段落块的持续时间。", + "asr_model_info": "默认情况下,它使用“Whisper模型”将口语转换为文本。使用自定义模型,例如,在下拉菜单中输入存储库名称“BELLE-2/Belle-whisper-large-v3-zh”以使用经过中文语言微调的模型。在Hugging Face上找到微调模型。", + "ctype_label": "计算类型", + "ctype_info": "选择较小的类型,如int8或float16,可以通过减少内存使用量和增加计算吞吐量来提高性能,但可能会牺牲与float32等较大数据类型相比的精度。", + "batchz_label": "批处理大小", + "batchz_info": "如果您的GPU的VRAM较少,则减小批处理大小可以节省内存,并有助于管理内存不足问题。", + "tsscale_label": "文本分段比例", + "tsscale_info": "按句子、单词或字符将文本分成段。按单词和字符进行分段可提供更精细的粒度,适用于字幕等用途;禁用翻译将保留原始结构。", + "srt_file_label": "上传SRT字幕文件(将用于替代Whisper的转录)", + "divide_text_label": "通过以下方式重新划分文本段:", + "divide_text_info": "(实验性)输入用于拆分源语言中现有文本段的分隔符。该工具将识别出现并相应地创建新段。使用|指定多个分隔符,例如:!|?|...|。", + "diarization_label": "辨识模型", + "tr_process_label": "翻译过程", + "out_type_label": "输出类型", + 
"out_name_label": "文件名", + "out_name_info": "输出文件的名称", + "task_sound_label": "任务状态声音", + "task_sound_info": "任务状态声音:播放指示任务完成或执行过程中错误的声音警报。", + "cache_label": "恢复进度", + "cache_info": "恢复进度:从上一个检查点继续进行流程。", + "preview_info": "预览将视频裁剪为仅10秒以进行测试。请在检索完整视频持续时间之前停用它。", + "edit_sub_label": "编辑生成的字幕", + "edit_sub_info": "编辑生成的字幕:允许您分两步运行翻译。首先使用 '获取字幕并编辑' 按钮获取字幕以编辑它们,然后使用 '翻译' 按钮生成视频", + "button_subs": "获取字幕并编辑", + "editor_sub_label": "生成的字幕", + "editor_sub_info": "请在此处编辑生成的字幕中的文本。您可以在点击 '翻译' 按钮之前更改界面选项,但不能更改 '源语言'、'翻译成' 和 '最多发言者',以避免错误。编辑完成后,点击 '翻译' 按钮。", + "editor_sub_ph": "首先点击 '获取字幕并编辑' 获取字幕", + "button_translate": "翻译", + "output_result_label": "下载翻译视频", + "sub_ori": "字幕", + "sub_tra": "翻译字幕", + "ht_token_info": "一个重要步骤是接受使用Pyannote的许可协议。您需要在Hugging Face上拥有一个帐户,并接受使用模型的许可:https://huggingface.co/pyannote/speaker-diarization 和 https://huggingface.co/pyannote/segmentation。在此处获取您的密钥令牌:https://hf.co/settings/tokens", + "ht_token_ph": "令牌放这里...", + "tab_docs": "文档翻译", + "docs_input_label": "选择文档来源", + "docs_input_info": "可以是PDF、DOCX、TXT或文本", + "docs_source_info": "这是文本的原始语言", + "chunk_size_label": "TTS每个段处理的最大字符数", + "chunk_size_info": "值为0分配了一个动态且更兼容的值给TTS。", + "docs_button": "开始语言转换桥", + "cv_url_info": "从URL自动下载R.V.C.模型。您可以使用HuggingFace或Drive的链接,您可以包括多个链接,每个链接用逗号分隔。示例:https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "替换语音:TTS到R.V.C.", + "sec1_title": "### 1. 要启用其使用,请将其标记为启用。", + "enable_replace": "选中此框以启用模型的使用。", + "sec2_title": "### 2. 选择将应用于每个相应发言者的TTS的声音,并应用配置。", + "sec2_subtitle": "根据您将使用的的数量,每个都需要其各自的模型。此外,如果某种原因未正确检测到发言者,则还有一个辅助模型。", + "cv_tts1": "选择要为发言者 1 应用的声音。", + "cv_tts2": "选择要为发言者 2 应用的声音。", + "cv_tts3": "选择要为发言者 3 应用的声音。", + "cv_tts4": "选择要为发言者 4 应用的声音。", + "cv_tts5": "选择要为发言者 5 应用的声音。", + "cv_tts6": "选择要为发言者 6 应用的声音。", + "cv_tts7": "选择要为发言者 7 应用的声音。", + "cv_tts8": "选择要为发言者 8 应用的声音。", + "cv_tts9": "选择要为发言者 9 应用的声音。", + "cv_tts10": "选择要为发言者 10 应用的声音。", + "cv_tts11": "选择要为发言者 11 应用的声音。", + "cv_tts12": "选择要为发言者 12 应用的声音。", + "cv_aux": "- 在某种原因下未成功检测到发言者时应用的声音。", + "cv_button_apply": "应用配置", + "tab_help": "帮助", + }, + "ukrainian": { + "description": """ + ### 🎥 **Перекладайте відео легко з SoniTranslate!** 📽️ + + Завантажте відео, аудіофайл або надайте посилання на YouTube. 📽️ **Отримайте оновлений ноутбук з офіційного репозиторію: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Дивіться вкладку `Довідка` за інструкціями, як цим користуватися. Давайте почнемо веселощі з перекладу відео! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Інструкції з використання:** + + 1. 📤 Завантажте **відео**, **аудіофайл** або надайте 🌐 **посилання на YouTube**. + + 2. 🌍 Виберіть мову, на яку ви хочете **перекласти відео**. + + 3. 🗣️ Вкажіть **кількість людей, які говорять** у відео і **призначте кожному голосу для синтезу мовлення тексту**, що відповідає мові перекладу. + + 4. 🚀 Натисніть кнопку '**Переклад**', щоб отримати результати. + + --- + + # 🧩 **SoniTranslate підтримує різні двигуни TTS (текст-у-мову), які є:** + - EDGE-TTS → формат `en-AU-WilliamNeural-Male` → Швидкий та точний. + - FACEBOOK MMS → формат `en-facebook-mms VITS` → Голос більш натуральний; наразі використовується лише ЦП. + - PIPER TTS → формат `en_US-lessac-high VITS-onnx` → Те ж саме, що й попередній, але оптимізований як для ЦП, так і для ГПУ. + - BARK → формат `en_speaker_0-Male BARK` → Хороша якість, але повільна, і вона схильна до галюцинацій. 
+ - OpenAI TTS → формат `>alloy OpenAI-TTS` → Мультиязычный, але потребує OpenAI API key + - Coqui XTTS → формат `_XTTS_/AUTOMATIC.wav` → Доступний лише для китайської (спрощеної), англійської, французької, німецької, італійської, португальської, польської, турецької, російської, голландської, чеської, арабської, іспанської, угорської, корейської та японської. + + --- + + # 🎤 Як використовувати голоси R.V.C. та R.V.C.2 (Необов'язково) 🎶 + + Мета - застосувати R.V.C. до створеного TTS (текст-у-мову) 🎙️ + + 1. У вкладці `Корист. голос R.V.C.` завантажте необхідні моделі 📥 Ви можете використовувати посилання з Hugging Face та Google Drive у форматах, таких як zip, pth або index. Ви також можете завантажити повні репозиторії просторів HF, але ця опція не дуже стабільна 😕 + + 2. Тепер перейдіть до `Заміна голосу: TTS на R.V.C.` та позначте поле `enable` ✅ Після цього ви можете вибрати моделі, які ви хочете застосувати до кожного говорця TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Налаштуйте метод F0, який буде застосовуватися до всіх R.V.C. 🎛️ + + 4. Натисніть `ЗАСТОСУВАТИ КОНФІГУРАЦІЮ`, щоб застосувати зроблені зміни 🔄 + + 5. Поверніться до вкладки перекладу відео та натисніть на 'Переклад' ▶️ Тепер переклад буде виконаний з використанням R.V.C. 🗣️ + + Порада: Ви можете використовувати `Тест R.V.C.` для експериментування та знаходження найкращих TTS або конфігурацій для застосування до R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Переклад відео", + "video_source": "Виберіть джерело відео", + "link_label": "Посилання на медіа.", + "link_info": "Приклад: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "Тут введіть URL...", + "dir_label": "Шлях до відео.", + "dir_info": "Приклад: /usr/home/my_video.mp4", + "dir_ph": "Тут введіть шлях...", + "sl_label": "Мова джерела", + "sl_info": "Це оригінальна мова відео", + "tat_label": "Переклад аудіо на", + "tat_info": "Виберіть цільову мову і також переконайтеся, що вибрали відповідний TTS для цієї мови.", + "num_speakers": "Виберіть, скільки людей говорить у відео.", + "min_sk": "Мін. говорці", + "max_sk": "Макс. говорці", + "tts_select": "Виберіть голос для кожного говорця.", + "sk1": "Говорець TTS 1", + "sk2": "Говорець TTS 2", + "sk3": "Говорець TTS 3", + "sk4": "Говорець TTS 4", + "sk5": "Говорець TTS 5", + "sk6": "Говорець TTS 6", + "sk7": "Говорець TTS 7", + "sk8": "Говорець TTS 8", + "sk9": "Говорець TTS 9", + "sk10": "Говорець TTS 10", + "sk11": "Говорець TTS 11", + "sk12": "Говорець TTS 12", + "vc_title": "Імітація голосу на різних мовах", + "vc_subtitle": """ + ### Відтворення голосу людини на різних мовах. + Хоча це ефективно з більшістю голосів при відповідному використанні, воно може не досягти ідеальності в кожному випадку. + Імітація голосу виключно відтворює тон джерела, не включаючи акцент і емоції, які контролюються базовою моделлю TTS говорця і не реплікуються конвертером. + Це займе аудіопроби з основного аудіо для кожного говорця та обробить їх. + """, + "vc_active_label": "Активна імітація голосу", + "vc_active_info": "Активна імітація голосу: Відтворює тон оригінального говорця", + "vc_method_label": "Метод", + "vc_method_info": "Виберіть метод для процесу імітації голосу", + "vc_segments_label": "Макс. проби", + "vc_segments_info": "Макс. 
проби: Кількість аудіопроб, які будуть згенеровані для процесу, більше - краще, але це може додати шум", + "vc_dereverb_label": "Прибрати реверберацію", + "vc_dereverb_info": "Прибрати реверберацію: Видаляє вокальну реверберацію з аудіопроб.", + "vc_remove_label": "Видалити попередні проби", + "vc_remove_info": "Видалити попередні проби: Видаляє попередні згенеровані проби, тому потрібно створити нові.", + "xtts_title": "Створити TTS на основі аудіо", + "xtts_subtitle": "Завантажте короткий аудіофайл максимум 10 секунд з голосом. Використовуючи XTTS, буде створений новий TTS з голосом, схожим на вказаний аудіофайл.", + "xtts_file_label": "Завантажте короткий аудіофайл з голосом", + "xtts_name_label": "Назва для TTS", + "xtts_name_info": "Використовуйте просту назву", + "xtts_dereverb_label": "Прибрати реверберацію з аудіо", + "xtts_dereverb_info": "Прибрати реверберацію з аудіо: Видаляє вокальну реверберацію з аудіо", + "xtts_button": "Обробити аудіо та включити його в селектор TTS", + "xtts_footer": "Автоматично генеруйте голосовий xtts: Ви можете використовувати `_XTTS_/AUTOMATIC.wav` у селекторі TTS, щоб автоматично генерувати сегменти для кожного говорця при генерації перекладу.", + "extra_setting": "Додаткові налаштування", + "acc_max_label": "Макс. прискорення аудіо", + "acc_max_info": "Максимальне прискорення для перекладених аудіосегментів для уникнення перекриття. Значення 1,0 означає відсутність прискорення", + "acc_rate_label": "Регулювання швидкості прискорення", + "acc_rate_info": "Регулювання швидкості прискорення: Налаштовує прискорення, щоб пристосуватися до сегментів, які потребують меншої швидкості, зберігаючи послідовність та враховуючи час наступного запуску.", + "or_label": "Зменшення перекриття", + "or_info": "Зменшення перекриття: Забезпечує відсутність перекриття сегментів за допомогою налаштування часу початку на основі попередніх часів завершення; може порушити синхронізацію.", + "aud_mix_label": "Метод мікшування аудіо", + "aud_mix_info": "Змішуйте оригінальні та перекладені аудіофайли, щоб створити налаштований, збалансований вихід з двома доступними режимами мікшування.", + "vol_ori": "Гучність оригінального аудіо", + "vol_tra": "Гучність перекладеного аудіо", + "voiceless_tk_label": "Безголосий трек", + "voiceless_tk_info": "Безголосий трек: Прибрати голоси оригінального аудіо перед його поєднанням з перекладеним аудіо.", + "sub_type": "Тип субтитрів", + "soft_subs_label": "М'які субтитри", + "soft_subs_info": "М'які субтитри: Додаткові субтитри, які глядачі можуть увімкнути або вимкнути під час перегляду відео.", + "burn_subs_label": "Підпалити субтитри", + "burn_subs_info": "Підпалити субтитри: Вбудувати субтитри у відео, зробивши їх постійною частиною візуального змісту.", + "whisper_title": "Налаштування транскрипції.", + "lnum_label": "Літералізація Чисел", + "lnum_info": "Літералізація Чисел: Заміна числових представлень на їх письмові еквіваленти в транскрипції.", + "scle_label": "Очищення Звуку", + "scle_info": "Очищення Звуку: Покращення голосів, видалення фонового шуму перед транскрипцією для максимальної точності відміток часу. Ця операція може зайняти час, особливо з довгими аудіофайлами.", + "sd_limit_label": "Обмеження тривалості сегменту", + "sd_limit_info": "Вкажіть максимальну тривалість (у секундах) для кожного сегменту. Аудіо буде оброблено за допомогою VAD, обмежуючи тривалість для кожного фрагменту сегменту.", + "asr_model_info": "Він перетворює усну мову на текст за допомогою моделі 'Whisper' за замовчуванням.
Використовуйте власну модель, наприклад, введіть ім'я репозиторію 'BELLE-2/Belle-whisper-large-v3-zh' у розкривному списку, щоб використовувати китайську мову з налаштованою моделлю. Знайдіть налаштовані моделі на Hugging Face.", + "ctype_label": "Тип обчислення", + "ctype_info": "Вибір менших типів, таких як int8 або float16, може покращити продуктивність, зменшивши використання пам'яті та збільшивши обчислювальну пропускну здатність, але може пожертвувати точністю порівняно з більшими типами даних, такими як float32.", + "batchz_label": "Розмір пакету", + "batchz_info": "Зменшення розміру пакета заощаджує пам'ять, якщо у вашої GPU менше VRAM, і допомагає керувати проблемами нестачі пам'яті.", + "tsscale_label": "Масштаб сегментації тексту", + "tsscale_info": "Розділіть текст на сегменти за допомогою речень, слів або символів. Сегментація за словами та символами надає більшу деталізацію, корисну для субтитрів; вимкнення перекладу зберігає вихідну структуру.", + "srt_file_label": "Завантажте файл субтитрів SRT (використовуватиметься замість транскрипції Whisper)", + "divide_text_label": "Розділити текстові сегменти за допомогою:", + "divide_text_info": "(Експериментально) Введіть роздільник для розділення існуючих текстових сегментів на мові джерела. Інструмент ідентифікує випадки та створює нові сегменти відповідно. Вказуйте кілька роздільників, використовуючи |, наприклад: !|?|...|。", + "diarization_label": "Модель діаризації", + "tr_process_label": "Процес перекладу", + "out_type_label": "Тип виводу", + "out_name_label": "Ім'я файлу", + "out_name_info": "Назва вихідного файлу", + "task_sound_label": "Звук статусу завдання", + "task_sound_info": "Звук статусу завдання: Відтворює звукове сповіщення про завершення завдання або помилки під час виконання.", + "cache_label": "Отримати Прогрес", + "cache_info": "Отримати Прогрес: Продовжити процес з останньої контрольної точки.", + "preview_info": "Перегляд обрізає відео лише до 10 секунд для тестування. Будь ласка, деактивуйте його, щоб отримати повну тривалість відео.", + "edit_sub_label": "Редагувати згенеровані субтитри", + "edit_sub_info": "Редагувати згенеровані субтитри: Дозволяє виконувати переклад у 2 етапи. Спочатку за допомогою кнопки 'ОТРИМАТИ СУБТИТРИ ТА РЕДАГУВАТИ' ви отримуєте субтитри, щоб ви могли їх відредагувати, а потім за допомогою кнопки 'ПЕРЕКЛАСТИ' ви можете створити відео", + "button_subs": "ОТРИМАТИ СУБТИТРИ ТА РЕДАГУВАТИ", + "editor_sub_label": "Згенеровані субтитри", + "editor_sub_info": "Вільно редагуйте текст в згенерованих субтитрах тут. Ви можете вносити зміни в параметри інтерфейсу перед тим, як натиснути кнопку 'ПЕРЕКЛАСТИ', за винятком 'Мови джерела', 'Переклад аудіо на' та 'Макс. говорці', щоб уникнути помилок. Як тільки ви закінчите, натисніть кнопку 'ПЕРЕКЛАСТИ'.", + "editor_sub_ph": "Спочатку натисніть 'ОТРИМАТИ СУБТИТРИ ТА РЕДАГУВАТИ', щоб отримати субтитри", + "button_translate": "ПЕРЕКЛАСТИ", + "output_result_label": "ЗАВАНТАЖИТИ ПЕРЕКЛАДЕНЕ ВІДЕО", + "sub_ori": "Субтитри", + "sub_tra": "Перекладені субтитри", + "ht_token_info": "Один із важливих кроків - прийняти ліцензійну угоду для використання Pyannote. Вам потрібно мати обліковий запис на Hugging Face та прийняти ліцензію для використання моделей: https://huggingface.co/pyannote/speaker-diarization та https://huggingface.co/pyannote/segmentation. 
Отримайте свій КЛЮЧОВИЙ ТОКЕН тут: https://hf.co/settings/tokens", + "ht_token_ph": "Токен вставляється тут...", + "tab_docs": "Переклад документів", + "docs_input_label": "Виберіть джерело документа", + "docs_input_info": "Це може бути PDF, DOCX, TXT або текст", + "docs_source_info": "Це початкова мова тексту", + "chunk_size_label": "Максимальна кількість символів, яку оброблятиме TTS на кожному сегменті", + "chunk_size_info": "Значення 0 призначає динамічне і більш сумісне значення для TTS.", + "docs_button": "Почати місткування мови", + "cv_url_info": "Автоматично завантажте моделі R.V.C. за URL-адресою. Ви можете використовувати посилання з HuggingFace або Drive, а також включати кілька посилань, кожне з яких відокремлене комою. Приклад: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Замінити голос: TTS на R.V.C.", + "sec1_title": "### 1. Для включення його використання відмітьте його як enable.", + "enable_replace": "Позначте це, щоб увімкнути використання моделей.", + "sec2_title": "### 2. Виберіть голос, який буде застосований до кожного TTS кожного відповідного говорця та застосуйте конфігурації.", + "sec2_subtitle": "Залежно від того, скільки ви будете використовувати, кожен з них потребує відповідної моделі. Крім того, є допоміжна, якщо з якихось причин говорець не розпізнається правильно.", + "cv_tts1": "Виберіть голос для застосування до говорця 1.", + "cv_tts2": "Виберіть голос для застосування до говорця 2.", + "cv_tts3": "Виберіть голос для застосування до говорця 3.", + "cv_tts4": "Виберіть голос для застосування до говорця 4.", + "cv_tts5": "Виберіть голос для застосування до говорця 5.", + "cv_tts6": "Виберіть голос для застосування до говорця 6.", + "cv_tts7": "Виберіть голос для застосування до говорця 7.", + "cv_tts8": "Виберіть голос для застосування до говорця 8.", + "cv_tts9": "Виберіть голос для застосування до говорця 9.", + "cv_tts10": "Виберіть голос для застосування до говорця 10.", + "cv_tts11": "Виберіть голос для застосування до говорця 11.", + "cv_tts12": "Виберіть голос для застосування до говорця 12.", + "cv_aux": "- Голос, який застосовується у разі невдалого розпізнавання говорця.", + "cv_button_apply": "ЗАСТОСУВАТИ КОНФІГУРАЦІЮ", + "tab_help": "Довідка", + }, + "arabic": { + "description": """ + ### 🎥 **ترجمة مقاطع الفيديو بسهولة مع SoniTranslate!** 📽️ + + قم بتحميل ملف فيديو أو صوتي أو قدم رابطًا لفيديو YouTube. 📽️ **احصل على الدفتر المحدث من المستودع الرسمي: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + انظر إلى علامة التبويب "المساعدة" للحصول على تعليمات حول كيفية استخدامه. لنبدأ بالمرح مع ترجمة الفيديو! 🚀🎉 + """, + "tutorial": """ + # 🔰 **تعليمات الاستخدام:** + + 1. 📤 قم بتحميل **فيديو** أو ملف **صوتي** أو قم بتقديم 🌐 **رابط YouTube.** + + 2. 🌍 اختر اللغة التي ترغب في **ترجمة الفيديو** إليها. + + 3. 🗣️ حدد **عدد الأشخاص الذين يتحدثون** في الفيديو و **تعيين كل واحد منهم صوتًا للنص إلى الكلام** مناسبًا للغة الترجمة. + + 4. 🚀 اضغط على زر '**ترجمة**' للحصول على النتائج. + + --- + + # 🧩 **يدعم SoniTranslate محركات TTS (نص إلى كلام) مختلفة، وهي:** + - EDGE-TTS → الصيغة `en-AU-WilliamNeural-Male` → سريع ودقيق. + - FACEBOOK MMS → الصيغة `en-facebook-mms VITS` → الصوت أكثر طبيعية؛ في الوقت الحالي، يستخدم فقط وحدة المعالجة المركزية. + - PIPER TTS → الصيغة `en_US-lessac-high VITS-onnx` → نفس الشيء كما السابق، ولكنه محسّن لكل من وحدة المعالجة المركزية ووحدة معالجة الرسومات. 
+ - BARK → الصيغة `en_speaker_0-Male BARK` → جودة جيدة ولكن بطيء، ويميل إلى التهليل. + - OpenAI TTS → الصيغة `>alloy OpenAI-TTS` → متعدد اللغات ولكن يتطلب OpenAI API key + - Coqui XTTS → الصيغة `_XTTS_/AUTOMATIC.wav` → متاحة فقط للصينية (المبسطة)، الإنجليزية، الفرنسية، الألمانية، الإيطالية، البرتغالية، البولندية، التركية، الروسية، الهولندية، التشيكية، العربية، الإسبانية، الهنغارية، الكورية واليابانية. + + --- + + # 🎤 كيفية استخدام أصوات R.V.C. و R.V.C.2 (اختياري) 🎶 + + الهدف هو تطبيق صوت R.V.C. على TTS المولد (نص إلى كلام) 🎙️ + + 1. في علامة التبويب "الصوت المخصص R.V.C."، قم بتنزيل النماذج التي تحتاجها 📥 يمكنك استخدام روابط من Hugging Face وGoogle Drive بتنسيقات مثل zip أو pth أو index. يمكنك أيضًا تنزيل مستودعات مساحة HF الكاملة، ولكن هذا الخيار غير مستقر جدًا 😕 + + 2. الآن، انتقل إلى "Replace voice: TTS to R.V.C." وحدد مربع "تمكين" ✅ بعد ذلك، يمكنك اختيار النماذج التي تريد تطبيقها على كل متحدث TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. ضبط طريقة F0 التي ستُطبَّق على جميع R.V.C. 🎛️ + + 4. اضغط على `APPLY CONFIGURATION` لتطبيق التغييرات التي قمت بها 🔄 + + 5. ارجع إلى علامة التبويب لترجمة الفيديو وانقر فوق 'ترجمة' ▶️ الآن، سيتم إجراء الترجمة بتطبيق R.V.C. 🗣️ + + نصيحة: يمكنك استخدام `Test R.V.C.` لتجربة والعثور على أفضل TTS أو التكوينات لتطبيق R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "ترجمة الفيديو", + "video_source": "اختر مصدر الفيديو", + "link_label": "رابط الوسائط.", + "link_info": "مثال: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "يتم إدخال الرابط هنا...", + "dir_label": "مسار الفيديو.", + "dir_info": "مثال: /usr/home/my_video.mp4", + "dir_ph": "يتم إدخال المسار هنا...", + "sl_label": "اللغة المصدر", + "sl_info": "هذه هي اللغة الأصلية للفيديو", + "tat_label": "ترجمة الصوت إلى", + "tat_info": "حدد اللغة المستهدفة وتأكد أيضًا من اختيار TTS المقابل لتلك اللغة.", + "num_speakers": "حدد كم عدد الأشخاص الذين يتحدثون في الفيديو.", + "min_sk": "الحد الأدنى من الأشخاص", + "max_sk": "الحد الأقصى من الأشخاص", + "tts_select": "اختر الصوت الذي تريده لكل متحدث.", + "sk1": "متحدث TTS 1", + "sk2": "متحدث TTS 2", + "sk3": "متحدث TTS 3", + "sk4": "متحدث TTS 4", + "sk5": "متحدث TTS 5", + "sk6": "متحدث TTS 6", + "sk7": "متحدث TTS 7", + "sk8": "متحدث TTS 8", + "sk9": "متحدث TTS 9", + "sk10": "متحدث TTS 10", + "sk11": "متحدث TTS 11", + "sk12": "متحدث TTS 12", + "vc_title": "تقليد صوت في لغات مختلفة", + "vc_subtitle": """ + ### استنساخ صوت الشخص عبر لغات متعددة. + على الرغم من كفاءته مع معظم الأصوات عند استخدامه بشكل مناسب، قد لا يتم التمام في كل حالة. + يقوم تقليد الصوت بالتمثيل فقط لنغمة المتحدث الأصلي، باستثناء اللكنة والعاطفة، التي تحكمها نموذج TTS الأصلي ولا يقوم المحول بتكرارها. + سيتم أخذ عينات الصوت من الصوت الرئيسي لكل متحدث ومعالجتها. + """, + "vc_active_label": "تقليد صوت نشط", + "vc_active_info": "تقليد صوت نشط: يقوم بتمثيل نغمة المتحدث الأصلي", + "vc_method_label": "الطريقة", + "vc_method_info": "حدد طريقة لعملية تقليد الصوت", + "vc_segments_label": "الحد الأقصى للعينات", + "vc_segments_info": "الحد الأقصى للعينات: هو عدد عينات الصوت التي سيتم إنشاؤها للعملية، كلما كانت أكثر كانت أفضل ولكن يمكن أن تضيف ضوضاء", + "vc_dereverb_label": "إزالة الصدى", + "vc_dereverb_info": "إزالة الصدى: يُطبَّق تقنية إزالة الصدى الصوتي على عينات الصوت.", + "vc_remove_label": "إزالة العينات السابقة", + "vc_remove_info": "إزالة العينات السابقة: قم بإزالة العينات السابقة التي تم إنشاؤها، لذلك يجب إنشاء عينات جديدة.", + "xtts_title": "إنشاء TTS استنادًا إلى صوت", + "xtts_subtitle": "قم بتحميل ملف صوتي لمدة 10 ثوانٍ كحد أقصى بصوت. 
باستخدام XTTS، سيتم إنشاء TTS جديد بصوت مشابه للملف الصوتي المقدم.", + "xtts_file_label": "قم بتحميل ملف صوتي قصير بالصوت", + "xtts_name_label": "اسم لـ TTS", + "xtts_name_info": "استخدم اسمًا بسيطًا", + "xtts_dereverb_label": "إزالة صدى الصوت", + "xtts_dereverb_info": "إزالة صدى الصوت: يُطبَّق تقنية إزالة صدى الصوت على الصوت", + "xtts_button": "معالجة الصوت وتضمينه في محدد TTS", + "xtts_footer": "توليد صوت xtts تلقائيًا: يمكنك استخدام `_XTTS_/AUTOMATIC.wav` في محدد TTS لتوليد تقاطعات لكل متحدث تلقائيًا عند إنشاء الترجمة.", + "extra_setting": "إعدادات متقدمة", + "acc_max_label": "التسارع الصوتي الأقصى", + "acc_max_info": "التسارع الأقصى لقطع الصوت المترجم لتجنب التداخل. قيمة 1.0 تمثل عدم وجود تسارع", + "acc_rate_label": "تنظيم معدل التسارع", + "acc_rate_info": "تنظيم معدل التسارع: يعدل التسارع لتوفير مقاطع تتطلب سرعة أقل، مع الحفاظ على الاستمرارية واعتبار توقيت البدء التالي.", + "or_label": "تقليل التداخل", + "or_info": "تقليل التداخل: يضمن عدم تداخل الشرائح عن طريق ضبط أوقات البدء استنادًا إلى الأوقات السابقة للنهاية ؛ قد يؤدي إلى إختلال التزامن.", + "aud_mix_label": "طريقة مزج الصوت", + "aud_mix_info": "مزج ملفات الصوت الأصلية والمترجمة لإنشاء إخراج مخصص ومتوازن بوجود طريقتي مزج متاحتين.", + "vol_ori": "مستوى صوت الصوت الأصلي", + "vol_tra": "مستوى صوت الصوت المترجم", + "voiceless_tk_label": "مسار بدون صوت", + "voiceless_tk_info": "مسار بدون صوت: قم بإزالة الأصوات الصوتية الأصلية قبل دمجها مع الصوت المترجم.", + "sub_type": "نوع العنوان الفرعي", + "soft_subs_label": "ترجمة نصية ناعمة", + "soft_subs_info": "ترجمة نصية ناعمة: ترجمات نصية اختيارية يمكن للمشاهدين تشغيلها أو إيقافها أثناء مشاهدة الفيديو.", + "burn_subs_label": "حرق الترجمة الفرعية", + "burn_subs_info": "حرق الترجمة الفرعية: تضمين الترجمة الفرعية في الفيديو، مما يجعلها جزءًا دائمًا من المحتوى البصري.", + "whisper_title": "تكوين النص السريع.", + "lnum_label": "تحويل الأرقام إلى كلمات", + "lnum_info": "تحويل الأرقام إلى كلمات: استبدال التمثيلات الرقمية بمكافآتها المكتوبة في النص المكتوب.", + "scle_label": "تنظيف الصوت", + "scle_info": "تنظيف الصوت: تعزيز الأصوات، إزالة الضجيج الخلفي قبل التفريغ للحصول على أقصى دقة في الطابع الزمني. قد تستغرق هذه العملية وقتًا، خاصة مع ملفات الصوت الطويلة.", + "sd_limit_label": "حد مدة القطعة", + "sd_limit_info": "حدد المدة القصوى (بالثواني) لكل قطعة. سيتم معالجة الصوت باستخدام VAD، محددة مدة كل قطعة.", + "asr_model_info": "يحول اللغة الحية إلى نص باستخدام نموذج 'الهمس' افتراضيًا. استخدم نموذجًا مخصصًا، على سبيل المثال، عن طريق إدخال اسم المستودع 'BELLE-2/Belle-whisper-large-v3-zh' في القائمة المنسدلة لاستخدام نموذج معدل باللغة الصينية. العثور على النماذج المعدلة على Hugging Face.", + "ctype_label": "نوع الحساب", + "ctype_info": "اختيار أنواع أصغر مثل int8 أو float16 يمكن أن يحسن الأداء من خلال تقليل استخدام الذاكرة وزيادة الإخراج الحسابي، ولكن قد يضحي بالدقة مقارنة بأنواع البيانات الأكبر مثل float32.", + "batchz_label": "حجم الدفعة", + "batchz_info": "توفير الذاكرة عن طريق تقليل حجم الدفعة إذا كان لديك بطاقة رسومات GPU تحتوي على VRAM أقل وتساعد في إدارة مشكلات الذاكرة النفاد.", + "tsscale_label": "مقياس تقسيم النص", + "tsscale_info": "تقسيم النص إلى قطع حسب الجمل أو الكلمات أو الأحرف. يوفر تقسيم الكلمات والأحرف دقة أكبر، وهو مفيد للترجمات الفورية؛ يحافظ تعطيل الترجمة على الهيكل الأصلي.", + "srt_file_label": "قم بتحميل ملف عنوان فرعي SRT (سيُستخدم بدلاً من النص السريع)", + "divide_text_label": "إعادة تقسيم شرائح النص بواسطة:", + "divide_text_info": "(تجريبي) أدخل فاصل لتقسيم شرائح النص الحالية في اللغة المصدر. ستحدد الأداة حدوث الحالات وإنشاء شرائح جديدة وفقًا لذلك. 
حدد علامات فاصلة متعددة باستخدام |، على سبيل المثال: !|؟|...|。", + "diarization_label": "نموذج توثيق الصوت", + "tr_process_label": "عملية الترجمة", + "out_type_label": "نوع الإخراج", + "out_name_label": "اسم الملف", + "out_name_info": "اسم الملف الناتج", + "task_sound_label": "صوت حالة المهمة", + "task_sound_info": "صوت حالة المهمة: يشغل تنبيه صوتي يشير إلى اكتمال المهمة أو الأخطاء أثناء التنفيذ.", + "cache_label": "استعادة التقدم", + "cache_info": "استعادة التقدم: متابعة العملية من نقطة التفتيش الأخيرة.", + "preview_info": "يقوم المعاينة بتقطيع الفيديو لمدة 10 ثوانٍ فقط لأغراض الاختبار. يرجى إلغاء تنشيطه لاسترداد مدة الفيديو الكاملة.", + "edit_sub_label": "تحرير العناوين الفرعية المولدة", + "edit_sub_info": "تحرير العناوين الفرعية المولدة: يتيح لك تشغيل الترجمة في 2 خطوة. أولاً بزر 'GET SUBTITLES AND EDIT'، يتم الحصول على العناوين الفرعية لتحريرها، ومن ثم بزر 'TRANSLATE'، يمكنك إنشاء الفيديو", + "button_subs": "GET SUBTITLES AND EDIT", + "editor_sub_label": "العناوين الفرعية المولدة", + "editor_sub_info": "لا تتردد في تحرير النص في العناوين الفرعية المولدة هنا. يمكنك إجراء تغييرات على خيارات الواجهة قبل النقر على زر 'TRANSLATE'، باستثناء 'اللغة المصدر' و 'ترجمة الصوت إلى' و 'الحد الأقصى للأشخاص'، لتجنب الأخطاء. بمجرد الانتهاء، انقر فوق الزر 'TRANSLATE'.", + "editor_sub_ph": "اضغط أولاً على 'GET SUBTITLES AND EDIT' للحصول على العناوين الفرعية", + "button_translate": "TRANSLATE", + "output_result_label": "تنزيل الفيديو المترجم", + "sub_ori": "العناوين الفرعية", + "sub_tra": "العناوين الفرعية المترجمة", + "ht_token_info": "خطوة مهمة هي قبول اتفاقية الترخيص لاستخدام Pyannote. يجب أن تكون لديك حساب على Hugging Face وقبول الترخيص لاستخدام النماذج: https://huggingface.co/pyannote/speaker-diarization و https://huggingface.co/pyannote/segmentation. احصل على مفتاحك الخاص هنا: https://hf.co/settings/tokens", + "ht_token_ph": "يتم إدخال المفتاح هنا...", + "tab_docs": "ترجمة المستندات", + "docs_input_label": "اختر مصدر المستند", + "docs_input_info": "يمكن أن يكون PDF، DOCX، TXT، أو نص", + "docs_source_info": "هذه هي اللغة الأصلية للنص", + "chunk_size_label": "الحد الأقصى لعدد الأحرف التي سيعالجها TTS في كل شريحة", + "chunk_size_info": "تُخصص قيمة 0 قيمة ديناميكية وأكثر توافقًا لـ TTS.", + "docs_button": "بدء جسر تحويل اللغة", + "cv_url_info": "قم بتنزيل نماذج R.V.C. تلقائيًا من الرابط. يمكنك استخدام روابط من HuggingFace أو Drive، ويمكنك تضمين عدة روابط، مفصولة بفاصلة. على سبيل المثال: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth، https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "استبدال الصوت: TTS إلى R.V.C.", + "sec1_title": "### 1. لتمكين استخدامه، ضع علامة على تمكينه.", + "enable_replace": "تحقق من ذلك لتمكين استخدام النماذج.", + "sec2_title": "### 2. اختر صوتًا سيتم تطبيقه على كل TTS لكل متحدث مقابل وطبق التكوينات.", + "sec2_subtitle": "يعتمد ذلك على عدد <متحدث TTS> الذي ستستخدمه، ويحتاج كل منها إلى نموذجه الخاص. 
بالإضافة إلى ذلك، هناك واحدة مساعدة في حالة عدم اكتشاف المتحدث بشكل صحيح لأي سبب ما.", + "cv_tts1": "اختر الصوت المراد تطبيقه على المتحدث 1.", + "cv_tts2": "اختر الصوت المراد تطبيقه على المتحدث 2.", + "cv_tts3": "اختر الصوت المراد تطبيقه على المتحدث 3.", + "cv_tts4": "اختر الصوت المراد تطبيقه على المتحدث 4.", + "cv_tts5": "اختر الصوت المراد تطبيقه على المتحدث 5.", + "cv_tts6": "اختر الصوت المراد تطبيقه على المتحدث 6.", + "cv_tts7": "اختر الصوت المراد تطبيقه على المتحدث 7.", + "cv_tts8": "اختر الصوت المراد تطبيقه على المتحدث 8.", + "cv_tts9": "اختر الصوت المراد تطبيقه على المتحدث 9.", + "cv_tts10": "اختر الصوت المراد تطبيقه على المتحدث 10.", + "cv_tts11": "اختر الصوت المراد تطبيقه على المتحدث 11.", + "cv_tts12": "اختر الصوت المراد تطبيقه على المتحدث 12.", + "cv_aux": "- الصوت المراد تطبيقه في حالة عدم اكتشاف المتحدث بنجاح.", + "cv_button_apply": "تطبيق التكوين", + "tab_help": "مساعدة", + }, + "russian": { + "description": """ + ### 🎥 **Перевод видео легко с SoniTranslate!** 📽️ + + Загрузите видео, аудиофайл или предоставьте ссылку на YouTube. 📽️ **Получите обновленный блокнот из официального репозитория.: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Посмотрите вкладку `Помощь` для инструкций о том, как это использовать. Давайте начнем веселиться с переводом видео! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Инструкции по использованию:** + + 1. 📤 Загрузите **видео**, **аудиофайл** или предоставьте 🌐 **ссылку на YouTube**. + + 2. 🌍 Выберите язык, на который вы хотите **перевести видео**. + + 3. 🗣️ Укажите **количество говорящих** в видео и **назначьте каждому голос синтеза речи** подходящий для языка перевода. + + 4. 🚀 Нажмите кнопку '**Перевести**', чтобы получить результаты. + + --- + + # 🧩 **SoniTranslate поддерживает различные движки TTS (текст в речь), которые включают:** + - EDGE-TTS → формат `en-AU-WilliamNeural-Male` → Быстро и точно. + - FACEBOOK MMS → формат `en-facebook-mms VITS` → Голос более естественный; на данный момент используется только процессор. + - PIPER TTS → формат `en_US-lessac-high VITS-onnx` → То же самое, что и предыдущее, но оптимизировано как для CPU, так и для GPU. + - BARK → формат `en_speaker_0-Male BARK` → Хорошее качество, но медленное, и оно подвержено галлюцинациям. + - OpenAI TTS → формат `>alloy OpenAI-TTS` → Многоязычный, но требуется OpenAI API key + - Coqui XTTS → формат `_XTTS_/AUTOMATIC.wav` → Доступен только для китайского (упрощенного), английского, французского, немецкого, итальянского, португальского, польского, турецкого, русского, голландского, чешского, арабского, испанского, венгерского, корейского и японского языков. + + --- + + # 🎤 Как использовать голоса R.V.C. и R.V.C.2 (необязательно) 🎶 + + Цель - применить R.V.C. к созданному TTS (текст в речь) 🎙️ + + 1. На вкладке `Настройка пользовательского голоса R.V.C.` загрузите необходимые модели 📥 Можно использовать ссылки из Hugging Face и Google Drive в форматах zip, pth или index. Вы также можете загрузить полные репозитории HF space, но эта опция не очень стабильна 😕 + + 2. Теперь перейдите в раздел `Заменить голос: TTS на R.V.C.` и установите флажок `включить` ✅ После этого вы сможете выбрать модели, которые хотите применить к каждому говорителю TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Настройте метод F0, который будет применен ко всем R.V.C. 🎛️ + + 4. Нажмите `ПРИМЕНИТЬ КОНФИГУРАЦИЮ`, чтобы применить внесенные вами изменения 🔄 + + 5. Вернитесь на вкладку перевода видео и нажмите 'Перевести' ▶️ Теперь перевод будет выполнен с применением R.V.C. 
🗣️ + + Совет: Вы можете использовать `Тест R.V.C.` для экспериментов и поиска лучших TTS или конфигураций для применения к R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Перевод видео", + "video_source": "Выберите источник видео", + "link_label": "Ссылка на медиа.", + "link_info": "Пример: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "Сюда вставьте URL...", + "dir_label": "Путь к видео.", + "dir_info": "Пример: /usr/home/my_video.mp4", + "dir_ph": "Сюда вставьте путь...", + "sl_label": "Исходный язык", + "sl_info": "Это оригинальный язык видео", + "tat_label": "Перевести аудио на", + "tat_info": "Выберите целевой язык и также убедитесь, что выбран соответствующий TTS для этого языка.", + "num_speakers": "Выберите, сколько людей говорят в видео.", + "min_sk": "Мин. количество говорящих", + "max_sk": "Макс. количество говорящих", + "tts_select": "Выберите голос для каждого говорящего.", + "sk1": "Говорящий 1 (TTS)", + "sk2": "Говорящий 2 (TTS)", + "sk3": "Говорящий 3 (TTS)", + "sk4": "Говорящий 4 (TTS)", + "sk5": "Говорящий 5 (TTS)", + "sk6": "Говорящий 6 (TTS)", + "sk7": "Говорящий 7 (TTS)", + "sk8": "Говорящий 8 (TTS)", + "sk9": "Говорящий 9 (TTS)", + "sk10": "Говорящий 10 (TTS)", + "sk11": "Говорящий 11 (TTS)", + "sk12": "Говорящий 12 (TTS)", + "vc_title": "Имитация голоса на разных языках", + "vc_subtitle": """ + ### Воспроизведение голоса человека на разных языках. + Несмотря на то, что оно эффективно с большинством голосов при правильном использовании, в некоторых случаях оно может не достигать идеальности. + Имитация голоса полностью воспроизводит тон референсного диктора, исключая акцент и эмоции, которые контролируются базовой моделью TTS диктора и не воспроизводятся конвертером. + Это займет аудиосэмплы из основного аудио для каждого говорящего и обработает их. + """, + "vc_active_label": "Активировать имитацию голоса", + "vc_active_info": "Активировать имитацию голоса: Воспроизводит тон оригинального говорящего", + "vc_method_label": "Метод", + "vc_method_info": "Выберите метод для процесса имитации голоса", + "vc_segments_label": "Макс. количество сэмплов", + "vc_segments_info": "Максимальное количество сэмплов: это количество аудиосэмплов, которые будут сгенерированы для процесса, чем их больше, тем лучше, но это может добавить шум", + "vc_dereverb_label": "Удалить реверберацию", + "vc_dereverb_info": "Удалить реверберацию: Удаляет вокальную реверберацию из аудиосэмплов.", + "vc_remove_label": "Удалить предыдущие сэмплы", + "vc_remove_info": "Удалить предыдущие сэмплы: Удаляет предыдущие сгенерированные сэмплы, поэтому нужно создавать новые.", + "xtts_title": "Создание TTS на основе аудио", + "xtts_subtitle": "Загрузите аудиофайл максимум на 10 секунд с голосом. Используя XTTS, будет создан новый TTS с голосом, аналогичным предоставленному аудиофайлу.", + "xtts_file_label": "Загрузить короткое аудио с голосом", + "xtts_name_label": "Название для TTS", + "xtts_name_info": "Используйте простое название", + "xtts_dereverb_label": "Удалить реверберацию аудио", + "xtts_dereverb_info": "Удалить реверберацию аудио: Удаляет вокальную реверберацию из аудио", + "xtts_button": "Обработать аудио и включить его в селектор TTS", + "xtts_footer": "Генерировать голосовой XTTS автоматически: Вы можете использовать `_XTTS_/AUTOMATIC.wav` в селекторе TTS для автоматической генерации сегментов для каждого говорящего при создании перевода.", + "extra_setting": "Дополнительные настройки", + "acc_max_label": "Макс.
ускорение аудио", + "acc_max_info": "Максимальное ускорение для переведенных аудиосегментов для избежания их перекрытия. Значение 1.0 означает отсутствие ускорения", + "acc_rate_label": "Регулирование уровня ускорения", + "acc_rate_info": "Регулирование уровня ускорения: Регулирует ускорение для адаптации к сегментам, требующим меньшей скорости, сохраняя непрерывность и учитывая временные параметры следующего запуска.", + "or_label": "Сокращение перекрытий", + "or_info": "Сокращение перекрытий: Обеспечивает отсутствие перекрытия сегментов путем корректировки времени начала на основе предыдущих времен завершения; может нарушить синхронизацию.", + "aud_mix_label": "Метод смешивания аудио", + "aud_mix_info": "Смешивание оригинальных и переведенных аудиофайлов для создания настраиваемого, сбалансированного вывода с двумя доступными режимами смешивания.", + "vol_ori": "Громкость оригинального аудио", + "vol_tra": "Громкость переведенного аудио", + "voiceless_tk_label": "Безголосовая дорожка", + "voiceless_tk_info": "Безголосовая дорожка: Удалить голоса оригинального аудио перед его смешиванием с переведенным аудио.", + "sub_type": "Тип субтитров", + "soft_subs_label": "Мягкие субтитры", + "soft_subs_info": "Мягкие субтитры: Дополнительные субтитры, которые зрители могут включать или выключать во время просмотра видео.", + "burn_subs_label": "Вжечь субтитры", + "burn_subs_info": "Вжечь субтитры: Внедрить субтитры в видео, сделав их постоянной частью визуального контента.", + "whisper_title": "Конфигурация транскрипции.", + "lnum_label": "Литерализация Чисел", + "lnum_info": "Литерализация Чисел: Замена числовых представлений их письменными эквивалентами в транскрипции.", + "scle_label": "Очистка Звука", + "scle_info": "Очистка Звука: Улучшение голосов, удаление фонового шума перед транскрипцией для максимальной точности временных меток. Эта операция может занять время, особенно с длинными аудиофайлами.", + "sd_limit_label": "Ограничение Длительности Сегмента", + "sd_limit_info": "Укажите максимальную длительность (в секундах) для каждого сегмента. Аудио будет обработано с использованием VAD, ограничивая длительность для каждого фрагмента сегмента.", + "asr_model_info": "Он преобразует устную речь в текст с использованием модели 'Whisper' по умолчанию. Используйте пользовательскую модель, например, введите имя репозитория 'BELLE-2/Belle-whisper-large-v3-zh' в выпадающем списке, чтобы использовать китайскую модель. Найдите настроенные модели на Hugging Face.", + "ctype_label": "Тип вычисления", + "ctype_info": "Выбор меньших типов, таких как int8 или float16, может улучшить производительность за счет уменьшения использования памяти и увеличения вычислительного потока, но может пожертвовать точностью по сравнению с более крупными типами данных, такими как float32.", + "batchz_label": "Размер Пакета", + "batchz_info": "Уменьшение размера пакета экономит память, если у вашей GPU меньше VRAM, и помогает управлять проблемами с памятью.", + "tsscale_label": "Масштабирование сегментации текста", + "tsscale_info": "Разделите текст на сегменты по предложениям, словам или символам. 
Сегментация по словам и символам обеспечивает более точную гранулярность, полезную для субтитров; отключение перевода сохраняет исходную структуру.", + "srt_file_label": "Загрузить файл субтитров в формате SRT (будет использоваться вместо транскрипции Whisper)", + "divide_text_label": "Разделить текстовые сегменты по:", + "divide_text_info": "(Экспериментально) Введите разделитель для разделения существующих текстовых сегментов на исходном языке. Инструмент определит вхождения и создаст новые сегменты в соответствии с ними. Укажите несколько разделителей, используя |, например: !|?|...|。", + "diarization_label": "Модель диаризации", + "tr_process_label": "Процесс перевода", + "out_type_label": "Тип вывода", + "out_name_label": "Имя файла", + "out_name_info": "Название выходного файла", + "task_sound_label": "Звук статуса задачи", + "task_sound_info": "Звук статуса задачи: Воспроизводит звуковой сигнал, указывающий на завершение задачи или ошибки во время выполнения.", + "cache_label": "Восстановление прогресса", + "cache_info": "Восстановление прогресса: Продолжить процесс с последней контрольной точки.", + "preview_info": "Предпросмотр обрезает видео до 10 секунд только для тестовых целей. Пожалуйста, отключите его, чтобы получить полную продолжительность видео.", + "edit_sub_label": "Редактировать сгенерированные субтитры", + "edit_sub_info": "Редактировать сгенерированные субтитры: Позволяет выполнять перевод в 2 этапа. Сначала нажмите кнопку 'ПОЛУЧИТЬ СУБТИТРЫ И РЕДАКТИРОВАТЬ', чтобы получить субтитры и отредактировать их, а затем с помощью кнопки 'ПЕРЕВЕСТИ' вы можете сгенерировать видео", + "button_subs": "ПОЛУЧИТЬ СУБТИТРЫ И РЕДАКТИРОВАТЬ", + "editor_sub_label": "Сгенерированные субтитры", + "editor_sub_info": "Не стесняйтесь редактировать текст в сгенерированных субтитрах здесь. Вы можете вносить изменения в параметры интерфейса перед нажатием кнопки 'ПЕРЕВЕСТИ', за исключением 'Исходный язык', 'Перевести аудио на' и 'Макс. количество говорящих', чтобы избежать ошибок. Как только закончите, нажмите кнопку 'ПЕРЕВЕСТИ'.", + "editor_sub_ph": "Сначала нажмите 'ПОЛУЧИТЬ СУБТИТРЫ И РЕДАКТИРОВАТЬ', чтобы получить субтитры", + "button_translate": "ПЕРЕВЕСТИ", + "output_result_label": "СКАЧАТЬ ПЕРЕВЕДЕННОЕ ВИДЕО", + "sub_ori": "Субтитры", + "sub_tra": "Переведенные субтитры", + "ht_token_info": "Один из важных шагов - принятие лицензионного соглашения на использование Pyannote. Вам нужно иметь учетную запись на Hugging Face и принять лицензию, чтобы использовать модели: https://huggingface.co/pyannote/speaker-diarization и https://huggingface.co/pyannote/segmentation. Получите свой КЛЮЧ ТОКЕН здесь: https://hf.co/settings/tokens", + "ht_token_ph": "Сюда вставьте токен...", + "tab_docs": "Перевод документов", + "docs_input_label": "Выберите источник документа", + "docs_input_info": "Это может быть PDF, DOCX, TXT или текст", + "docs_source_info": "Это оригинальный язык текста", + "chunk_size_label": "Макс. количество символов, которые будет обрабатывать TTS для каждого сегмента", + "chunk_size_info": "Значение 0 назначает динамическое и более совместимое значение для TTS.", + "docs_button": "Запустить мост перевода языка", + "cv_url_info": "Автоматическая загрузка моделей R.V.C. по URL. Можно использовать ссылки из HuggingFace или Drive, и можно включить несколько ссылок, каждую разделенную запятой. 
Пример: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Заменить голос: TTS на R.V.C.", + "sec1_title": "### 1. Чтобы активировать его использование, укажите его как включенный.", + "enable_replace": "Установите флажок, чтобы включить использование моделей.", + "sec2_title": "### 2. Выберите голос, который будет применен к каждому TTS каждого соответствующего говорящего и примените конфигурации.", + "sec2_subtitle": "В зависимости от того, сколько вы будете использовать, каждый из них нуждается в своей соответствующей модели. Кроме того, есть вспомогательная, если по какой-то причине говорящий не был определен правильно.", + "cv_tts1": "Выберите голос для применения для Говорящего 1.", + "cv_tts2": "Выберите голос для применения для Говорящего 2.", + "cv_tts3": "Выберите голос для применения для Говорящего 3.", + "cv_tts4": "Выберите голос для применения для Говорящего 4.", + "cv_tts5": "Выберите голос для применения для Говорящего 5.", + "cv_tts6": "Выберите голос для применения для Говорящего 6.", + "cv_tts7": "Выберите голос для применения для Говорящего 7.", + "cv_tts8": "Выберите голос для применения для Говорящего 8.", + "cv_tts9": "Выберите голос для применения для Говорящего 9.", + "cv_tts10": "Выберите голос для применения для Говорящего 10.", + "cv_tts11": "Выберите голос для применения для Говорящего 11.", + "cv_tts12": "Выберите голос для применения для Говорящего 12.", + "cv_aux": "- Голос, который будет применен в случае успешного неопределения говорящего.", + "cv_button_apply": "ПРИМЕНИТЬ КОНФИГУРАЦИЮ", + "tab_help": "Помощь", + }, + "turkish": { + "description": """ + ### 🎥 **SoniTranslate ile videoları kolayca çevirin!** 📽️ + + Bir video yükleyin, ses dosyası ekleyin veya bir YouTube bağlantısı sağlayın. 📽️ **Güncellenmiş notebook'ı resmi depodan alın: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Kullanım talimatları için 'Yardım' sekmesine bakın. Video çevirisi yapmaya başlayalım! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Kullanım Talimatları:** + + 1. 📤 Bir **video**, **ses dosyası** yükleyin veya bir 🌐 **YouTube bağlantısı sağlayın.** + + 2. 🌍 **Videodaki metni çevirmek istediğiniz dili seçin.** + + 3. 🗣️ Videodaki **konuşan kişi sayısını belirtin** ve her birine çeviri dili için uygun bir metin-okuma-sesini atayın. + + 4. 🚀 Sonuçları elde etmek için '**Çevir**' düğmesine basın. + + --- + + # 🧩 **SoniTranslate, farklı TTS (Metin-okuma-sesi) motorlarını destekler, bunlar:** + - EDGE-TTS → biçim `tr-TR-ZeynepNeural-Kadın` → Hızlı ve doğru. + - FACEBOOK MMS → biçim `tr-facebook-mms VITS` → Ses daha doğal; şu anda yalnızca CPU kullanıyor. + - PIPER TTS → biçim `tr_TR-lessac-high VITS-onnx` → Öncekiyle aynı, ancak hem CPU hem de GPU için optimize edilmiştir. + - BARK → biçim `tr_speaker_0-Kadın BARK` → İyi kalite ancak yavaş ve halüsinasyonlara eğilimli. + - OpenAI TTS → biçim `>alloy OpenAI-TTS` → Çok dilli ancak bir OpenAI API key gerektirir + - Coqui XTTS → biçim `_XTTS_/AUTOMATIC.wav` → Sadece Çince (Basitleştirilmiş), İngilizce, Fransızca, Almanca, İtalyanca, Portekizce, Lehçe, Türkçe, Rusça, Hollandaca, Çekçe, Arapça, İspanyolca, Macarca, Korece ve Japonca için mevcut. + + --- + + # 🎤 R.V.C. ve R.V.C.2 Seslerini Nasıl Kullanılır (İsteğe Bağlı) 🎶 + + Amaç, oluşturulan TTS'ye bir R.V.C. uygulamaktır (Metin-okuma-sesi) 🎙️ + + 1. 'Özel Ses R.V.C.' 
sekmesinde, ihtiyacınız olan modelleri indirin 📥 Hugging Face ve Google Drive gibi bağlantıları, zip, pth veya index gibi formatlarda kullanabilirsiniz. Tam HF alanı depolarını da indirebilirsiniz, ancak bu seçenek çok kararlı değil 😕 + + 2. Şimdi, 'TTS'den R.V.C.'yi değiştirin' seçeneğini işaretleyin ✅ Bundan sonra, her TTS konuşucusuna uygulamak istediğiniz modelleri seçebilirsiniz 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Tüm R.V.C.'lere uygulanacak F0 yöntemini ayarlayın 🎛️ + + 4. Yaptığınız değişiklikleri uygulamak için 'YAPILAN AYARLARI UYGULA' düğmesine basın 🔄 + + 5. Video çevirisi sekmesine geri dönün ve 'Çevir' düğmesine tıklayın ▶️ Artık çeviri, R.V.C. uygulanarak yapılacaktır 🗣️ + + İpucu: En iyi TTS'leri veya yapılandırmaları R.V.C.'ye uygulamak için 'Test R.V.C.'yi kullanabilirsiniz 🧪🔍 + + --- + + """, + "tab_translate": "Video çevirisi", + "video_source": "Video Kaynağını Seçin", + "link_label": "Medya bağlantısı.", + "link_info": "Örnek: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL buraya girin...", + "dir_label": "Video Yolu.", + "dir_info": "Örnek: /usr/home/my_video.mp4", + "dir_ph": "Yol buraya girin...", + "sl_label": "Kaynak dil", + "sl_info": "Videoyun orijinal dilidir", + "tat_label": "Şuna çevir", + "tat_info": "Hedef dili seçin ve ayrıca o dil için uygun metin-okuma-sesini seçtiğinizden emin olun.", + "num_speakers": "Videoda kaç kişi konuşuyor seçin.", + "min_sk": "Min konuşmacılar", + "max_sk": "Max konuşmacılar", + "tts_select": "Her konuşmacı için istediğiniz sesi seçin.", + "sk1": "TTS Konuşmacı 1", + "sk2": "TTS Konuşmacı 2", + "sk3": "TTS Konuşmacı 3", + "sk4": "TTS Konuşmacı 4", + "sk5": "TTS Konuşmacı 5", + "sk6": "TTS Konuşmacı 6", + "sk7": "TTS Konuşmacı 7", + "sk8": "TTS Konuşmacı 8", + "sk9": "TTS Konuşmacı 9", + "sk10": "TTS Konuşmacı 10", + "sk11": "TTS Konuşmacı 11", + "sk12": "TTS Konuşmacı 12", + "vc_title": "Farklı Dillerde Ses Taklidi", + "vc_subtitle": """ + ### Bir kişinin sesini çeşitli dillere yayın. + Uygun şekilde kullanıldığında çoğu sesle etkili olsa da, her durumda mükemmelliği elde etmeyebilir. + Ses Taklidi yalnızca referans konuşucunun tonunu çoğaltır, aksan ve duygu dışında, + temel konuşucu TTS modeli tarafından yönetilen ve dönüştürücü tarafından çoğaltılmayanlar hariç. + Bu, her konuşmacı için ana ses kaydından ses örnekleri alır ve işler. + """, + "vc_active_label": "Aktif Ses Taklidi", + "vc_active_info": "Aktif Ses Taklidi: Orijinal konuşmacının tonunu çoğaltır", + "vc_method_label": "Yöntem", + "vc_method_info": "Ses Taklidi işlemi için bir yöntem seçin", + "vc_segments_label": "Maksimum örnekler", + "vc_segments_info": "Maksimum örnekler: İşlem için üretilecek ses örneklerinin sayısıdır, daha fazlası daha iyidir ancak gürültü ekleyebilir", + "vc_dereverb_label": "Yankıyı Azalt", + "vc_dereverb_info": "Yankıyı Azalt: Ses örneklerine yankı azaltma uygular.", + "vc_remove_label": "Önceki örnekleri Kaldır", + "vc_remove_info": "Önceki örnekleri Kaldır: Önceki üretilen örnekleri kaldırır, bu nedenle yeni olanları oluşturmak gerekir.", + "xtts_title": "Bir ses tabanlı TTS oluştur", + "xtts_subtitle": "Sesli bir sesle en fazla 10 saniyelik bir ses dosyası yükleyin. 
XTTS kullanarak, sağlanan ses dosyasına benzer bir sesle yeni bir TTS oluşturulur.", + "xtts_file_label": "Sesli bir sesle kısa bir ses dosyası yükleyin", + "xtts_name_label": "TTS için bir isim belirleyin", + "xtts_name_info": "Basit bir isim kullanın", + "xtts_dereverb_label": "Sesi Yankıdan Temizle", + "xtts_dereverb_info": "Sesi Yankıdan Temizle: Sese yankı azaltma uygular", + "xtts_button": "Ses işleme ve TTS seçimine dahil et", + "xtts_footer": "Ses xtts otomatik olarak oluştur: TTS seçicisinde `_XTTS_/AUTOMATIC.wav`ı kullanarak, çeviri oluştururken her konuşmacı için otomatik olarak bölümler oluşturabilirsiniz.", + "extra_setting": "Gelişmiş Ayarlar", + "acc_max_label": "Maksimum Ses Hızlandırması", + "acc_max_info": "Çakışmayı önlemek için çevrilen ses segmentlerinin maksimum hızlandırması. 1.0 değeri hiçbir hızlandırmayı temsil eder", + "acc_rate_label": "Hızlanma Oranı Düzenlemesi", + "acc_rate_info": "Hızlanma Oranı Düzenlemesi: Daha az hız gerektiren segmentlere uyum sağlamak için hızlanmayı ayarlar, sürekliliği korur ve sonraki başlangıç zamanını dikkate alır.", + "or_label": "Örtüşme Azaltma", + "or_info": "Örtüşme Azaltma: Önceki bitiş zamanlarına dayanarak başlangıç zamanlarını ayarlayarak segmentlerin örtüşmesini engeller; senkronizasyonu bozabilir.", + "aud_mix_label": "Ses Karıştırma Yöntemi", + "aud_mix_info": "Özgün ve çevrilmiş ses dosyalarını karıştırarak iki kullanılabilir karıştırma moduyla özelleştirilmiş, dengeli bir çıkış oluşturun.", + "vol_ori": "Özgün ses seviyesi", + "vol_tra": "Çevrilmiş ses seviyesi", + "voiceless_tk_label": "Sessiz Parça", + "voiceless_tk_info": "Sessiz Parça: Çevrilmiş sesle birleştirilmeden önce özgün sesleri kaldırır.", + "sub_type": "Altyazı türü", + "soft_subs_label": "Yumuşak Altyazılar", + "soft_subs_info": "Yumuşak Altyazılar: İzleyicilerin video izlerken açıp kapatabileceği isteğe bağlı altyazılar.", + "burn_subs_label": "Altyazıyı Yak", + "burn_subs_info": "Altyazıyı Yak: Altyazıları videoya gömerek, bunları görsel içeriğin kalıcı bir parçası haline getirir.", + "whisper_title": "Transkripsiyonu yapılandır.", + "lnum_label": "Sayıları Metinleştir", + "lnum_info": "Sayıları Metinleştir: Transkript içindeki sayısal temsilleri yazılı eşdeğerleriyle değiştirin.", + "scle_label": "Ses Temizliği", + "scle_info": "Ses Temizliği: Zaman damgası hassasiyeti için transkripsiyondan önce sesleri iyileştirin, arka plan gürültüsünü kaldırın. Bu işlem özellikle uzun ses dosyalarıyla zaman alabilir.", + "sd_limit_label": "Bölüm Süresi Sınırı", + "sd_limit_info": "Her bölüm için maksimum süreyi (saniye cinsinden) belirtin. Ses, her bölüm parçası için süreyi sınırlayarak VAD kullanılarak işlenecektir.", + "asr_model_info": "Varsayılan olarak 'Fısıldama modeli'ni kullanarak konuşma dilini metne dönüştürür. Özel bir model kullanın, örneğin, özel bir model kullanmak için açılan menüye 'BELLE-2/Belle-whisper-large-v3-zh' depo adını girin. 
Hugging Face'de ince ayarlı modeller bulun.", + "ctype_label": "Hesaplama Türü", + "ctype_info": "int8 veya float16 gibi daha küçük tipleri seçmek, bellek kullanımını azaltarak ve hesaplama verimliliğini artırarak performansı artırabilir, ancak float32 gibi daha büyük veri tiplerine göre hassasiyetten ödün verebilir.", + "batchz_label": "Toplu İş Boyutu", + "batchz_info": "GPU'nuzun daha az VRAM'a sahip olması durumunda toplu iş boyutunu azaltmak bellek tasarrufu sağlar ve Bellek Dışı Sorunları yönetmeye yardımcı olur.", + "tsscale_label": "Metin Bölme Ölçeği", + "tsscale_info": "Metni cümleler, kelimeler veya karakterler olarak bölümlere ayırın. Kelime ve karakter bölme, altyazılar için faydalı olan daha ince granülerlik sağlar; çeviriyi devre dışı bırakma, orijinal yapının korunmasını sağlar.", + "srt_file_label": "Bir SRT altyazı dosyası yükleyin (Whisper'ın transkripsiyonu yerine kullanılacaktır)", + "divide_text_label": "Metin bölümlerini yeniden böl:", + "divide_text_info": "(Deneysel) Mevcut metin segmentlerini kaynak dildeki ayraçla bölmek için bir ayraç girin. Aracı, bu ayraçları tanımlayacak ve buna göre yeni segmentler oluşturacaktır. Birden çok ayıraç belirtmek için | kullanın, örn .: !|?|...|。", + "diarization_label": "Diyarizasyon Modeli", + "tr_process_label": "Çeviri Süreci", + "out_type_label": "Çıkış Türü", + "out_name_label": "Dosya adı", + "out_name_info": "Çıkış dosyasının adı", + "task_sound_label": "Görev Durumu Ses", + "task_sound_info": "Görev Durumu Ses: Görev tamamlanması veya yürütme sırasında hataları belirten bir ses uyarısı çalar.", + "cache_label": "İlerlemeyi Getir", + "cache_info": "İlerlemeyi Getir: Son kontrol noktasından işlemi devam ettir.", + "preview_info": "Önizleme, test amaçları için videonun sadece 10 saniyelik kısmını keser. Lütfen tam video süresini almak için önizlemeyi devre dışı bırakın.", + "edit_sub_label": "Oluşturulan altyazıları düzenleyin", + "edit_sub_info": "Oluşturulan altyazıları düzenlemeyi sağlar: Çeviriyi 2 adımda çalıştırmanıza izin verir. İlk olarak 'ALTYAZILARI AL VE DÜZENLE' düğmesiyle altyazıları alır, bunları düzenleyebilir ve ardından 'ÇEVİR' düğmesine tıklayarak videoyu oluşturabilirsiniz", + "button_subs": "ALTYAZILARI AL VE DÜZENLE", + "editor_sub_label": "Oluşturulan altyazılar", + "editor_sub_info": "Burada oluşturulan altyazılardaki metni düzenleyebilirsiniz. Arayüz seçeneklerinde değişiklikler yapabilirsiniz, ancak 'Kaynak dil', 'Şuna çevir' ve 'Max konuşmacılar' dışında hata oluşmaması için 'ÇEVİR' düğmesine basmadan önce. Bitirdiğinizde, 'ÇEVİR' düğmesine tıklayın.", + "editor_sub_ph": "Altyazıları almak için önce 'ALTYAZILARI AL VE DÜZENLE'ye basın", + "button_translate": "ÇEVİR", + "output_result_label": "ÇEVİRİLEN VİDEOYU İNDİR", + "sub_ori": "Altyazılar", + "sub_tra": "Çevrilmiş altyazılar", + "ht_token_info": "Bir önemli adım, Pyannote kullanım lisans anlaşmasını kabul etmektir. Modelleri kullanmak için Hugging Face'de bir hesabınız olması ve lisansı kabul etmeniz gerekir: https://huggingface.co/pyannote/speaker-diarization ve https://huggingface.co/pyannote/segmentation. 
Anahtar JETONUNUZU buradan alın: https://hf.co/settings/tokens", + "ht_token_ph": "Jetona buradan girin...", + "tab_docs": "Belge çevirisi", + "docs_input_label": "Belge Kaynağını Seçin", + "docs_input_info": "PDF, DOCX, TXT veya metin olabilir", + "docs_source_info": "Bu, metnin orijinal dilidir", + "chunk_size_label": "TTS'nin her segment başına işleyeceği maksimum karakter sayısı", + "chunk_size_info": "0 değeri, TTS için dinamik ve daha uyumlu bir değer atar.", + "docs_button": "Dil Dönüşüm Köprüsünü Başlat", + "cv_url_info": "R.V.C. modellerini otomatik olarak URL'den indirin. HuggingFace veya Drive bağlantılarını kullanabilir ve her birini virgülle ayırarak birden çok bağlantı ekleyebilirsiniz. Örnek: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Ses: TTS'den R.V.C.'ye Değiştir", + "sec1_title": "### 1. Kullanımını etkinleştirmek için onu işaretleyin.", + "enable_replace": "Modellerin kullanımını etkinleştirmek için bunu işaretleyin.", + "sec2_title": "### 2. Her karşılık gelen konuşmacı TTS'sine uygulanacak bir ses seçin ve yapılandırmaları uygulayın.", + "sec2_subtitle": "Kullanacağınız sayısına bağlı olarak, her biri kendi modeline ihtiyaç duyar. Ayrıca, konuşmacı doğru şekilde tespit edilmezse, bir yardımcı model de bulunmaktadır.", + "cv_tts1": "Konuşmacı 1 için uygulanacak sesi seçin.", + "cv_tts2": "Konuşmacı 2 için uygulanacak sesi seçin.", + "cv_tts3": "Konuşmacı 3 için uygulanacak sesi seçin.", + "cv_tts4": "Konuşmacı 4 için uygulanacak sesi seçin.", + "cv_tts5": "Konuşmacı 5 için uygulanacak sesi seçin.", + "cv_tts6": "Konuşmacı 6 için uygulanacak sesi seçin.", + "cv_tts7": "Konuşmacı 7 için uygulanacak sesi seçin.", + "cv_tts8": "Konuşmacı 8 için uygulanacak sesi seçin.", + "cv_tts9": "Konuşmacı 9 için uygulanacak sesi seçin.", + "cv_tts10": "Konuşmacı 10 için uygulanacak sesi seçin.", + "cv_tts11": "Konuşmacı 11 için uygulanacak sesi seçin.", + "cv_tts12": "Konuşmacı 12 için uygulanacak sesi seçin.", + "cv_aux": "- Konuşmacı doğru şekilde algılanamadığında uygulanacak ses.", + "cv_button_apply": "AYARLARI UYGULA", + "tab_help": "Yardım", + }, + "indonesian": { + "description": """ + ### 🎥 **Terjemahkan video dengan mudah menggunakan SoniTranslate!** 📽️ + + Unggah video, file audio, atau berikan tautan YouTube. 📽️ **Dapatkan buku catatan yang diperbarui dari repositori resmi: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Lihat tab `Bantuan` untuk petunjuk penggunaan. Mari mulai bersenang-senang dengan menerjemahkan video! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Petunjuk penggunaan:** + + 1. 📤 Unggah sebuah **video**, **file audio** atau berikan sebuah tautan 🌐 **YouTube.** + + 2. 🌍 Pilih bahasa di mana Anda ingin **menerjemahkan video** tersebut. + + 3. 🗣️ Tentukan **jumlah orang yang berbicara** dalam video dan **berikan masing-masing suara teks-ke-suara yang sesuai** untuk bahasa terjemahan. + + 4. 🚀 Tekan tombol '**Terjemahkan**' untuk mendapatkan hasilnya. + + --- + + # 🧩 **SoniTranslate mendukung berbagai mesin TTS (Teks-ke-Suara), yaitu:** + - EDGE-TTS → format `en-AU-WilliamNeural-Male` → Cepat dan akurat. + - FACEBOOK MMS → format `en-facebook-mms VITS` → Suara lebih alami; saat ini, hanya menggunakan CPU. + - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Sama seperti sebelumnya, tetapi dioptimalkan untuk CPU dan GPU. + - BARK → format `en_speaker_0-Male BARK` → Kualitas bagus tetapi lambat, dan rentan terhadap halusinasi. 
+ - OpenAI TTS → format `>alloy OpenAI-TTS` → Multibahasa tetapi membutuhkan OpenAI API key + - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Hanya tersedia untuk Cina (Sederhana), Inggris, Prancis, Jerman, Italia, Portugis, Polandia, Turki, Rusia, Belanda, Ceko, Arab, Spanyol, Hungaria, Korea, dan Jepang. + + --- + + # 🎤 Cara Menggunakan Suara R.V.C. dan R.V.C.2 (Opsional) 🎶 + + Tujuannya adalah menerapkan R.V.C. pada TTS yang dihasilkan (Teks-ke-Suara) 🎙️ + + 1. Di tab `Suara Kustom R.V.C.`, unduh model-model yang Anda butuhkan 📥 Anda dapat menggunakan tautan dari Hugging Face dan Google Drive dalam format zip, pth, atau index. Anda juga dapat mengunduh repositori ruang HF lengkap, tetapi opsi ini tidak sangat stabil 😕 + + 2. Sekarang, pergi ke `Ganti suara: TTS ke R.V.C.` dan centang kotak `aktifkan` ✅ Setelah ini, Anda dapat memilih model yang ingin Anda terapkan pada setiap pembicara TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Sesuaikan metode F0 yang akan diterapkan pada semua R.V.C. 🎛️ + + 4. Tekan `TERAPKAN KONFIGURASI` untuk menerapkan perubahan yang Anda buat 🔄 + + 5. Kembali ke tab terjemahan video dan klik 'Terjemahkan' ▶️ Sekarang, terjemahan akan dilakukan dengan menerapkan R.V.C. 🗣️ + + Tip: Anda dapat menggunakan `Uji R.V.C.` untuk bereksperimen dan menemukan TTS atau konfigurasi terbaik untuk diterapkan pada R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Terjemahan Video", + "video_source": "Pilih Sumber Video", + "link_label": "Tautan Media.", + "link_info": "Contoh: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL masukkan di sini...", + "dir_label": "Path Video.", + "dir_info": "Contoh: /usr/home/my_video.mp4", + "dir_ph": "Path masukkan di sini...", + "sl_label": "Bahasa Sumber", + "sl_info": "Ini adalah bahasa asli video", + "tat_label": "Terjemahkan audio ke", + "tat_info": "Pilih bahasa target dan pastikan juga memilih TTS yang sesuai untuk bahasa tersebut.", + "num_speakers": "Pilih berapa banyak orang yang berbicara dalam video.", + "min_sk": "Pembicara minimum", + "max_sk": "Pembicara maksimum", + "tts_select": "Pilih suara yang Anda inginkan untuk setiap pembicara.", + "sk1": "Pembicara TTS 1", + "sk2": "Pembicara TTS 2", + "sk3": "Pembicara TTS 3", + "sk4": "Pembicara TTS 4", + "sk5": "Pembicara TTS 5", + "sk6": "Pembicara TTS 6", + "sk7": "Pembicara TTS 7", + "sk8": "Pembicara TTS 8", + "sk9": "Pembicara TTS 9", + "sk10": "Pembicara TTS 10", + "sk11": "Pembicara TTS 11", + "sk12": "Pembicara TTS 12", + "vc_title": "Imitasi Suara dalam Berbagai Bahasa", + "vc_subtitle": """ + ### Reproduksi suara seseorang di berbagai bahasa. + Meskipun efektif dengan kebanyakan suara ketika digunakan dengan tepat, mungkin tidak mencapai kesempurnaan dalam setiap kasus. + Imitasi Suara hanya mereproduksi nada pembicara referensi, mengecualikan aksen dan emosi, yang dikendalikan oleh model TTS pembicara dasar dan tidak direplikasi oleh konverter. + Ini akan mengambil sampel audio dari audio utama untuk setiap pembicara dan memprosesnya. 
+ """, + "vc_active_label": "Imitasi Suara Aktif", + "vc_active_info": "Imitasi Suara Aktif: Mereplikasi nada pembicara asli", + "vc_method_label": "Metode", + "vc_method_info": "Pilih metode untuk proses Imitasi Suara", + "vc_segments_label": "Sampel maksimum", + "vc_segments_info": "Sampel maksimum: Jumlah sampel audio yang akan dihasilkan untuk proses, semakin banyak lebih baik tetapi dapat menambah noise", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Menyertakan dereverb vokal ke sampel audio.", + "vc_remove_label": "Hapus sampel sebelumnya", + "vc_remove_info": "Hapus sampel sebelumnya: Menghapus sampel sebelumnya yang dihasilkan, sehingga yang baru perlu dibuat.", + "xtts_title": "Buat TTS berdasarkan audio", + "xtts_subtitle": "Unggah file audio dengan durasi maksimal 10 detik dengan suara. Dengan menggunakan XTTS, TTS baru akan dibuat dengan suara mirip dengan file audio yang diberikan.", + "xtts_file_label": "Unggah audio pendek dengan suara", + "xtts_name_label": "Nama untuk TTS", + "xtts_name_info": "Gunakan nama sederhana", + "xtts_dereverb_label": "Dereverb audio", + "xtts_dereverb_info": "Dereverb audio: Menyertakan dereverb vokal ke audio", + "xtts_button": "Proses audio dan masukkan ke dalam pemilih TTS", + "xtts_footer": "Hasilkan xtts suara secara otomatis: Anda dapat menggunakan `_XTTS_/AUTOMATIC.wav` di pemilih TTS untuk secara otomatis menghasilkan segmen untuk setiap pembicara saat menghasilkan terjemahan.", + "extra_setting": "Pengaturan Lanjutan", + "acc_max_label": "Akselerasi Audio maksimum", + "acc_max_info": "Akselerasi maksimum untuk segmen audio yang diterjemahkan untuk menghindari tumpang tindih. Nilai 1.0 mewakili tidak ada akselerasi", + "acc_rate_label": "Regulasi Tingkat Akselerasi", + "acc_rate_info": "Regulasi Tingkat Akselerasi: Menyesuaikan akselerasi untuk mengakomodasi segmen yang membutuhkan kecepatan lebih rendah, menjaga kontinuitas, dan mempertimbangkan waktu mulai berikutnya.", + "or_label": "Pengurangan Tumpang Tindih", + "or_info": "Pengurangan Tumpang Tindih: Memastikan segmen tidak tumpang tindih dengan menyesuaikan waktu mulai berdasarkan waktu selesai sebelumnya; bisa mengganggu sinkronisasi.", + "aud_mix_label": "Metode Penggabungan Audio", + "aud_mix_info": "Gabungkan file audio asli dan diterjemahkan untuk membuat output yang seimbang dengan dua mode pencampuran yang tersedia.", + "vol_ori": "Volume audio asli", + "vol_tra": "Volume audio yang diterjemahkan", + "voiceless_tk_label": "Track Tanpa Suara", + "voiceless_tk_info": "Track Tanpa Suara: Hapus suara audio asli sebelum menggabungkannya dengan audio yang diterjemahkan.", + "sub_type": "Tipe Subtitle", + "soft_subs_label": "Subtitel Lembut", + "soft_subs_info": "Subtitel Lembut: Subtitel opsional yang dapat ditonton penonton saat menonton video.", + "burn_subs_label": "Bakar Subtitle", + "burn_subs_info": "Bakar Subtitle: Menyematkan subtitle ke dalam video, menjadikannya bagian permanen dari konten visual.", + "whisper_title": "Konfigurasi transkripsi.", + "lnum_label": "Literalisasi Angka", + "lnum_info": "Literalisasi Angka: Gantikan representasi numerik dengan ekivalen tertulisnya dalam transkrip.", + "scle_label": "Pembersihan Suara", + "scle_info": "Pembersihan Suara: Tingkatkan vokal, hapus kebisingan latar belakang sebelum transkripsi untuk presisi timestamp maksimum. 
Operasi ini bisa memakan waktu, terutama dengan file audio yang panjang.", + "sd_limit_label": "Batas Durasi Segment", + "sd_limit_info": "Tentukan durasi maksimum (dalam detik) untuk setiap segmen. Audio akan diproses menggunakan VAD, membatasi durasi untuk setiap potongan segmen.", + "asr_model_info": "Ini mengubah bahasa yang diucapkan menjadi teks menggunakan model 'Whisper' secara default. Gunakan model kustom, misalnya, dengan memasukkan nama repositori 'BELLE-2/Belle-whisper-large-v3-zh' dalam dropdown untuk menggunakan model yang disesuaikan bahasa Cina. Temukan model yang disesuaikan di Hugging Face.", + "ctype_label": "Jenis Perhitungan", + "ctype_info": "Memilih tipe yang lebih kecil seperti int8 atau float16 dapat meningkatkan kinerja dengan mengurangi penggunaan memori dan meningkatkan throughput komputasi, tetapi dapat mengorbankan presisi dibandingkan dengan tipe data yang lebih besar seperti float32.", + "batchz_label": "Ukuran Batch", + "batchz_info": "Mengurangi ukuran batch menghemat memori jika GPU Anda memiliki VRAM yang lebih sedikit dan membantu mengelola masalah Out of Memory.", + "tsscale_label": "Skala Segmentasi Teks", + "tsscale_info": "Bagi teks menjadi segmen berdasarkan kalimat, kata, atau karakter. Segmentasi kata dan karakter menawarkan granularitas yang lebih halus, berguna untuk subjudul; menonaktifkan terjemahan mempertahankan struktur asli.", + "srt_file_label": "Unggah file subtitle SRT (akan digunakan sebagai gantinya dari transkripsi Whisper)", + "divide_text_label": "Bagi ulang segmen teks dengan:", + "divide_text_info": "(Eksperimental) Masukkan pemisah untuk membagi segmen teks yang ada dalam bahasa sumber. Alat ini akan mengidentifikasi kejadian dan membuat segmen baru sesuai. Tentukan beberapa pemisah menggunakan |, misalnya: !|?|...|。", + "diarization_label": "Model Diarization", + "tr_process_label": "Proses Penerjemahan", + "out_type_label": "Jenis Output", + "out_name_label": "Nama file", + "out_name_info": "Nama file output", + "task_sound_label": "Suara Status Tugas", + "task_sound_info": "Suara Status Tugas: Memainkan suara peringatan yang menandakan penyelesaian tugas atau kesalahan selama pelaksanaan.", + "cache_label": "Pemulihan Kemajuan", + "cache_info": "Pemulihan Kemajuan: Melanjutkan proses dari titik kontrol terakhir.", + "preview_info": "Pratinjau memotong video menjadi hanya 10 detik untuk tujuan pengujian. Harap nonaktifkan untuk mendapatkan durasi video penuh.", + "edit_sub_label": "Edit subtitle yang dihasilkan", + "edit_sub_info": "Edit subtitle yang dihasilkan: Memungkinkan Anda menjalankan terjemahan dalam 2 langkah. Pertama dengan tombol 'DAPATKAN SUBTITLES DAN EDIT', Anda mendapatkan subtitle untuk diedit, dan kemudian dengan tombol 'TERJEMAHKAN', Anda dapat menghasilkan video", + "button_subs": "DAPATKAN SUBTITLES DAN EDIT", + "editor_sub_label": "Subtitle yang dihasilkan", + "editor_sub_info": "Silakan sunting teks dalam subtitle yang dihasilkan di sini. Anda dapat membuat perubahan pada opsi antarmuka sebelum mengklik tombol 'TERJEMAHKAN', kecuali untuk 'Bahasa Sumber', 'Terjemahkan audio ke', dan 'Pembicara maksimum', untuk menghindari kesalahan. 
Setelah selesai, klik tombol 'TERJEMAHKAN'.", + "editor_sub_ph": "Pertama tekan 'DAPATKAN SUBTITLES DAN EDIT' untuk mendapatkan subtitle", + "button_translate": "TERJEMAHKAN", + "output_result_label": "UNDUH VIDEO TERJEMAHAN", + "sub_ori": "Subtitle", + "sub_tra": "Subtitle Terjemahan", + "ht_token_info": "Langkah penting adalah menerima perjanjian lisensi untuk menggunakan Pyannote. Anda perlu memiliki akun di Hugging Face dan menerima lisensi untuk menggunakan model: https://huggingface.co/pyannote/speaker-diarization dan https://huggingface.co/pyannote/segmentation. Dapatkan TOKEN KUNCI Anda di sini: https://hf.co/settings/tokens", + "ht_token_ph": "Token masukkan di sini...", + "tab_docs": "Terjemahan Dokumen", + "docs_input_label": "Pilih Sumber Dokumen", + "docs_input_info": "Ini bisa berupa PDF, DOCX, TXT, atau teks", + "docs_source_info": "Ini adalah bahasa asli teks", + "chunk_size_label": "Jumlah maksimum karakter yang akan diproses oleh TTS per segmen", + "chunk_size_info": "Nilai 0 menetapkan nilai dinamis dan lebih kompatibel untuk TTS.", + "docs_button": "Mulai Jembatan Konversi Bahasa", + "cv_url_info": "Unduh model R.V.C. secara otomatis dari URL. Anda dapat menggunakan tautan dari HuggingFace atau Drive, dan Anda dapat menyertakan beberapa tautan, masing-masing dipisahkan oleh koma. Contoh: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Ganti suara: TTS ke R.V.C.", + "sec1_title": "### 1. Untuk mengaktifkan penggunaannya, tandai sebagai aktif.", + "enable_replace": "Centang ini untuk mengaktifkan penggunaan model.", + "sec2_title": "### 2. Pilih suara yang akan diterapkan untuk setiap TTS dari setiap pembicara yang sesuai dan terapkan konfigurasinya.", + "sec2_subtitle": "Tergantung pada berapa banyak yang akan Anda gunakan, masing-masing memerlukan model yang sesuai. Selain itu, ada satu tambahan jika dengan beberapa alasan pembicara tidak terdeteksi dengan benar.", + "cv_tts1": "Pilih suara yang akan diterapkan untuk Pembicara 1.", + "cv_tts2": "Pilih suara yang akan diterapkan untuk Pembicara 2.", + "cv_tts3": "Pilih suara yang akan diterapkan untuk Pembicara 3.", + "cv_tts4": "Pilih suara yang akan diterapkan untuk Pembicara 4.", + "cv_tts5": "Pilih suara yang akan diterapkan untuk Pembicara 5.", + "cv_tts6": "Pilih suara yang akan diterapkan untuk Pembicara 6.", + "cv_tts7": "Pilih suara yang akan diterapkan untuk Pembicara 7.", + "cv_tts8": "Pilih suara yang akan diterapkan untuk Pembicara 8.", + "cv_tts9": "Pilih suara yang akan diterapkan untuk Pembicara 9.", + "cv_tts10": "Pilih suara yang akan diterapkan untuk Pembicara 10.", + "cv_tts11": "Pilih suara yang akan diterapkan untuk Pembicara 11.", + "cv_tts12": "Pilih suara yang akan diterapkan untuk Pembicara 12.", + "cv_aux": "- Suara yang akan diterapkan jika Pembicara tidak terdeteksi dengan sukses.", + "cv_button_apply": "TERAPKAN KONFIGURASI", + "tab_help": "Bantuan", + }, + "portuguese": { + "description": """ + ### 🎥 **Traduza vídeos facilmente com o SoniTranslate!** 📽️ + + Carregue um vídeo, arquivo de áudio ou forneça um link do YouTube. 📽️ **Obtenha o caderno atualizado do repositório oficial: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Consulte a guia `Ajuda` para instruções sobre como usá-lo. Vamos começar a nos divertir com a tradução de vídeos! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instruções de uso:** + + 1. 
📤 Carregue um **vídeo**, **arquivo de áudio** ou forneça um 🌐 **link do YouTube**. + + 2. 🌍 Escolha o idioma para o qual você deseja **traduzir o vídeo**. + + 3. 🗣️ Especifique o **número de pessoas falando** no vídeo e **atribua a cada uma uma voz de texto para fala** adequada ao idioma da tradução. + + 4. 🚀 Pressione o botão '**Traduzir**' para obter os resultados. + + --- + + # 🧩 **SoniTranslate suporta diferentes motores TTS (Texto para Fala), que são:** + - EDGE-TTS → formato `en-AU-WilliamNeural-Male` → Rápido e preciso. + - FACEBOOK MMS → formato `en-facebook-mms VITS` → A voz é mais natural; no momento, usa apenas CPU. + - PIPER TTS → formato `en_US-lessac-high VITS-onnx` → O mesmo que o anterior, mas é otimizado para CPU e GPU. + - BARK → formato `en_speaker_0-Male BARK` → Boa qualidade, mas lento e propenso a alucinações. + - OpenAI TTS → formato `>alloy OpenAI-TTS` → Multilíngue mas requer uma OpenAI API key + - Coqui XTTS → formato `_XTTS_/AUTOMATIC.wav` → Disponível apenas para Chinês (Simplificado), Inglês, Francês, Alemão, Italiano, Português, Polonês, Turco, Russo, Holandês, Tcheco, Árabe, Espanhol, Húngaro, Coreano e Japonês. + + --- + + # 🎤 Como Usar Vozes R.V.C. e R.V.C.2 (Opcional) 🎶 + + O objetivo é aplicar um R.V.C. ao TTS (Texto para Fala) gerado 🎙️ + + 1. Na aba `Voz Personalizada R.V.C.`, baixe os modelos que você precisa 📥 Você pode usar links do Hugging Face e Google Drive em formatos como zip, pth ou índice. Você também pode baixar repositórios completos do espaço HF, mas essa opção não é muito estável 😕 + + 2. Agora, vá para `Substituir voz: TTS para R.V.C.` e marque a caixa de seleção `habilitar` ✅ Após isso, você pode escolher os modelos que deseja aplicar a cada falante TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Ajuste o método F0 que será aplicado a todos os R.V.C. 🎛️ + + 4. Pressione `APLICAR CONFIGURAÇÃO` para aplicar as alterações feitas 🔄 + + 5. Volte para a aba de tradução de vídeo e clique em 'Traduzir' ▶️ Agora, a tradução será feita aplicando o R.V.C. 🗣️ + + Dica: Você pode usar `Testar R.V.C.` para experimentar e encontrar o melhor TTS ou configurações para aplicar ao R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Tradução de Vídeo", + "video_source": "Escolha a Fonte do Vídeo", + "link_label": "Link do Mídia.", + "link_info": "Exemplo: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL aqui...", + "dir_label": "Caminho do Vídeo.", + "dir_info": "Exemplo: /usr/home/meu_video.mp4", + "dir_ph": "Caminho aqui...", + "sl_label": "Idioma de Origem", + "sl_info": "Este é o idioma original do vídeo", + "tat_label": "Traduzir áudio para", + "tat_info": "Selecione o idioma de destino e também certifique-se de escolher o TTS correspondente para esse idioma.", + "num_speakers": "Selecione quantas pessoas estão falando no vídeo.", + "min_sk": "Mín. falantes", + "max_sk": "Máx. falantes", + "tts_select": "Selecione a voz desejada para cada falante.", + "sk1": "Falante TTS 1", + "sk2": "Falante TTS 2", + "sk3": "Falante TTS 3", + "sk4": "Falante TTS 4", + "sk5": "Falante TTS 5", + "sk6": "Falante TTS 6", + "sk7": "Falante TTS 7", + "sk8": "Falante TTS 8", + "sk9": "Falante TTS 9", + "sk10": "Falante TTS 10", + "sk11": "Falante TTS 11", + "sk12": "Falante TTS 12", + "vc_title": "Imitação de Voz em Diferentes Idiomas", + "vc_subtitle": """ + ### Reproduza a voz de uma pessoa em vários idiomas. + Embora eficaz com a maioria das vozes quando usada adequadamente, pode não alcançar a perfeição em todos os casos. 
+ A Imitação de Voz replica apenas o tom do falante de referência, excluindo sotaque e emoção, que são governados pelo modelo TTS do falante base e não replicados pelo conversor. + Isso pegará amostras de áudio do áudio principal para cada falante e as processará. + """, + "vc_active_label": "Ativar Imitação de Voz", + "vc_active_info": "Ativar Imitação de Voz: Replica o tom do falante original", + "vc_method_label": "Método", + "vc_method_info": "Selecione um método para o processo de Imitação de Voz", + "vc_segments_label": "Máx. amostras", + "vc_segments_info": "Máx. amostras: É o número de amostras de áudio que serão geradas para o processo, mais é melhor, mas pode adicionar ruído", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Aplica dereverb vocal às amostras de áudio.", + "vc_remove_label": "Remover amostras anteriores", + "vc_remove_info": "Remover amostras anteriores: Remove as amostras geradas anteriormente, então novas precisam ser criadas.", + "xtts_title": "Criar um TTS baseado em um áudio", + "xtts_subtitle": "Carregue um arquivo de áudio de no máximo 10 segundos com uma voz. Usando o XTTS, um novo TTS será criado com uma voz semelhante ao arquivo de áudio fornecido.", + "xtts_file_label": "Carregar um áudio curto com a voz", + "xtts_name_label": "Nome para o TTS", + "xtts_name_info": "Use um nome simples", + "xtts_dereverb_label": "Dereverb do áudio", + "xtts_dereverb_info": "Dereverb do áudio: Aplica dereverb vocal ao áudio", + "xtts_button": "Processar o áudio e incluí-lo no seletor de TTS", + "xtts_footer": "Gerar voz xtts automaticamente: Você pode usar `_XTTS_/AUTOMATIC.wav` no seletor de TTS para gerar automaticamente segmentos para cada falante ao gerar a tradução.", + "extra_setting": "Configurações Avançadas", + "acc_max_label": "Máx. Aceleração de Áudio", + "acc_max_info": "Aceleração máxima para segmentos de áudio traduzidos para evitar sobreposições. 
Um valor de 1.0 representa nenhuma aceleração", + "acc_rate_label": "Regulação da Taxa de Aceleração", + "acc_rate_info": "Regulação da Taxa de Aceleração: Ajusta a aceleração para acomodar segmentos que exigem menos velocidade, mantendo a continuidade e considerando o tempo de próximo início.", + "or_label": "Redução de sobreposição", + "or_info": "Redução de sobreposição: Garante que os segmentos não se sobreponham ajustando os horários de início com base nos horários de término anteriores; pode perturbar a sincronização.", + "aud_mix_label": "Método de Mistura de Áudio", + "aud_mix_info": "Misture arquivos de áudio original e traduzido para criar uma saída personalizada e equilibrada com dois modos de mistura disponíveis.", + "vol_ori": "Volume do áudio original", + "vol_tra": "Volume do áudio traduzido", + "voiceless_tk_label": "Faixa sem Voz", + "voiceless_tk_info": "Faixa sem Voz: Remova as vozes de áudio originais antes de combiná-las com o áudio traduzido.", + "sub_type": "Tipo de Legenda", + "soft_subs_label": "Legendas Suaves", + "soft_subs_info": "Legendas Suaves: Legendas opcionais que os espectadores podem ligar ou desligar enquanto assistem ao vídeo.", + "burn_subs_label": "Queimar Legendas", + "burn_subs_info": "Queimar Legendas: Incorporar legendas no vídeo, tornando-as uma parte permanente do conteúdo visual.", + "whisper_title": "Configurar transcrição.", + "lnum_label": "Literalizar Números", + "lnum_info": "Literalizar Números: Substituir representações numéricas por seus equivalentes escritos na transcrição.", + "scle_label": "Limpeza de Som", + "scle_info": "Limpeza de Som: Aprimorar vocais, remover ruído de fundo antes da transcrição para máxima precisão de marcação de tempo. Esta operação pode levar tempo, especialmente com arquivos de áudio longos.", + "sd_limit_label": "Limite de Duração do Segmento", + "sd_limit_info": "Especifique a duração máxima (em segundos) para cada segmento. O áudio será processado usando VAD, limitando a duração para cada fragmento de segmento.", + "asr_model_info": "Ele converte linguagem falada em texto usando o modelo 'Whisper' por padrão. Use um modelo personalizado, por exemplo, inserindo o nome do repositório 'BELLE-2/Belle-whisper-large-v3-zh' no menu suspenso para utilizar um modelo em chinês finetuned. Encontre modelos finetuned na Hugging Face.", + "ctype_label": "Tipo de Cálculo", + "ctype_info": "Escolher tipos menores como int8 ou float16 pode melhorar o desempenho, reduzindo o uso de memória e aumentando o throughput computacional, mas pode sacrificar a precisão em comparação com tipos de dados maiores como float32.", + "batchz_label": "Tamanho do Lote", + "batchz_info": "Reduzir o tamanho do lote economiza memória se sua GPU tiver menos VRAM e ajuda a gerenciar problemas de Memória Insuficiente.", + "tsscale_label": "Escala de Segmentação de Texto", + "tsscale_info": "Divida o texto em segmentos por frases, palavras ou caracteres. A segmentação por palavras e caracteres oferece granularidade mais fina, útil para legendas; desativar a tradução preserva a estrutura original.", + "srt_file_label": "Carregar um arquivo de legenda SRT (será usado em vez da transcrição de Whisper)", + "divide_text_label": "Redividir segmentos de texto por:", + "divide_text_info": "(Experimental) Insira um separador para dividir os segmentos de texto existentes no idioma de origem. A ferramenta identificará as ocorrências e criará novos segmentos conforme necessário. 
Especifique vários separadores usando |, por exemplo: !|?|...|。", + "diarization_label": "Modelo de Diarização", + "tr_process_label": "Processo de Tradução", + "out_type_label": "Tipo de Saída", + "out_name_label": "Nome do Arquivo", + "out_name_info": "O nome do arquivo de saída", + "task_sound_label": "Som do Estado da Tarefa", + "task_sound_info": "Som do Estado da Tarefa: Reproduz um alerta sonoro indicando a conclusão da tarefa ou erros durante a execução.", + "cache_label": "Recuperar Progresso", + "cache_info": "Recuperar Progresso: Continuar processo a partir do último checkpoint.", + "preview_info": "A prévia corta o vídeo para apenas 10 segundos para fins de teste. Por favor, desative para recuperar a duração completa do vídeo.", + "edit_sub_label": "Editar legendas geradas", + "edit_sub_info": "Editar legendas geradas: Permite executar a tradução em 2 etapas. Primeiro, com o botão 'OBTER LEGENDAS E EDITAR', você obtém as legendas para editá-las, e depois, com o botão 'TRADUZIR', você pode gerar o vídeo", + "button_subs": "OBTER LEGENDAS E EDITAR", + "editor_sub_label": "Legendas geradas", + "editor_sub_info": "Sinta-se à vontade para editar o texto nas legendas geradas aqui. Você pode fazer alterações nas opções de interface antes de clicar no botão 'TRADUZIR', exceto para 'Idioma de Origem', 'Traduzir áudio para' e 'Max. falantes', para evitar erros. Quando terminar, clique no botão 'TRADUZIR'.", + "editor_sub_ph": "Primeiro pressione 'OBTER LEGENDAS E EDITAR' para obter as legendas", + "button_translate": "TRADUZIR", + "output_result_label": "BAIXAR VÍDEO TRADUZIDO", + "sub_ori": "Legendas Originais", + "sub_tra": "Legendas Traduzidas", + "ht_token_info": "Um passo importante é aceitar o acordo de licença para usar o Pyannote. Você precisa ter uma conta no Hugging Face e aceitar a licença para usar os modelos: https://huggingface.co/pyannote/speaker-diarization e https://huggingface.co/pyannote/segmentation. Obtenha seu TOKEN CHAVE aqui: https://hf.co/settings/tokens", + "ht_token_ph": "Token aqui...", + "tab_docs": "Tradução de Documentos", + "docs_input_label": "Escolha a Fonte do Documento", + "docs_input_info": "Pode ser PDF, DOCX, TXT ou texto", + "docs_source_info": "Este é o idioma original do texto", + "chunk_size_label": "Máx. número de caracteres que o TTS processará por segmento", + "chunk_size_info": "Um valor de 0 atribui um valor dinâmico e mais compatível para o TTS.", + "docs_button": "Iniciar Ponte de Conversão de Idioma", + "cv_url_info": "Baixe automaticamente os modelos R.V.C. do URL. Você pode usar links do HuggingFace ou Drive, e pode incluir vários links, cada um separado por uma vírgula. Exemplo: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Substituir voz: TTS para R.V.C.", + "sec1_title": "### 1. Para habilitar seu uso, marque como habilitado.", + "enable_replace": "Marque isso para habilitar o uso dos modelos.", + "sec2_title": "### 2. Selecione uma voz que será aplicada a cada TTS de cada falante correspondente e aplique as configurações.", + "sec2_subtitle": "Dependendo de quantos você usará, cada um precisa do seu respectivo modelo. 
Além disso, há um auxiliar se, por algum motivo, o falante não for detectado corretamente.", + "cv_tts1": "Escolha a voz para aplicar ao Falante 1.", + "cv_tts2": "Escolha a voz para aplicar ao Falante 2.", + "cv_tts3": "Escolha a voz para aplicar ao Falante 3.", + "cv_tts4": "Escolha a voz para aplicar ao Falante 4.", + "cv_tts5": "Escolha a voz para aplicar ao Falante 5.", + "cv_tts6": "Escolha a voz para aplicar ao Falante 6.", + "cv_tts7": "Escolha a voz para aplicar ao Falante 7.", + "cv_tts8": "Escolha a voz para aplicar ao Falante 8.", + "cv_tts9": "Escolha a voz para aplicar ao Falante 9.", + "cv_tts10": "Escolha a voz para aplicar ao Falante 10.", + "cv_tts11": "Escolha a voz para aplicar ao Falante 11.", + "cv_tts12": "Escolha a voz para aplicar ao Falante 12.", + "cv_aux": "- Voz para aplicar caso um Falante não seja detectado com sucesso.", + "cv_button_apply": "APLICAR CONFIGURAÇÃO", + "tab_help": "Ajuda", + }, + "hindi": { + "description": """ + ### 🎥 **SoniTranslate के साथ वीडियो को आसानी से अनुवादित करें!** 📽️ + + एक वीडियो, ऑडियो फ़ाइल अपलोड करें या एक YouTube लिंक प्रदान करें। 📽️ **आधिकारिक भंडार से अपडेटेड नोटबुक प्राप्त करें: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + उसे 'मदद' टैब देखें इसका उपयोग कैसे करना है के निर्देशों के लिए। वीडियो अनुवाद के साथ मज़े करना शुरू करें! 🚀🎉 + """, + "tutorial": """ + # 🔰 **उपयोग के लिए निर्देश:** + + 1. 📤 **वीडियो**, **ऑडियो फ़ाइल** अपलोड करें या एक 🌐 **YouTube लिंक** प्रदान करें। + + 2. 🌍 चुनें कि आप किस भाषा में **वीडियो को अनुवादित** करना चाहते हैं। + + 3. 🗣️ वीडियो में **बोलने वाले लोगों की संख्या** और **प्रत्येक को टेक्स्ट-टू-स्पीच आवाज** देने का निर्देश दें, जो अनुवाद भाषा के लिए उपयुक्त है। + + 4. 🚀 '**अनुवाद**' बटन दबाएं और परिणाम प्राप्त करें। + + --- + + # 🧩 **SoniTranslate विभिन्न TTS (टेक्स्ट-टू-स्पीच) इंजनों का समर्थन करता है, जो हैं:** + - EDGE-TTS → प्रारूप `en-AU-WilliamNeural-Male` → तेज़ और सटीक। + - FACEBOOK MMS → प्रारूप `en-facebook-mms VITS` → आवाज अधिक प्राकृतिक है; वर्तमान में, यह केवल CPU का उपयोग करता है। + - PIPER TTS → प्रारूप `en_US-lessac-high VITS-onnx` → पिछले वाले के समान, लेकिन यह CPU और GPU दोनों के लिए अनुकूलित है। + - BARK → प्रारूप `en_speaker_0-Male BARK` → अच्छी गुणवत्ता है लेकिन धीमी, और यह हैलुसिनेशन के लिए प्रवृत्त है। + - OpenAI TTS → प्रारूप `>alloy OpenAI-TTS` → बहुभाषी लेकिन इसमें एक OpenAI API key की आवश्यकता है + - Coqui XTTS → प्रारूप `_XTTS_/AUTOMATIC.wav` → केवल चीनी (सरलीकृत), अंग्रेजी, फ्रेंच, जर्मन, इतालवी, पुर्तगाली, पोलिश, तुर्की, रूसी, डच, चेक, अरबी, स्पैनिश, हंगेरियन, कोरियाई और जापानी के लिए ही उपलब्ध है। + + --- + + # 🎤 R.V.C. और R.V.C.2 आवाज़ों का उपयोग कैसे करें (वैकल्पिक) 🎶 + + लक्ष्य है कि जेनेरेटेड TTS (टेक्स्ट-टू-स्पीच) पर एक R.V.C. लागू करें 🎙️ + + 1. `कस्टम आवाज़ आर.वी.सी.` टैब में, आपको आवश्यक मॉडल डाउनलोड करने की आवश्यकता है 📥 आप हग्गिंग फेस और गूगल ड्राइव से लिंक्स का उपयोग कर सकते हैं जैसे zip, pth, या इंडेक्स के प्रारूप में। आप पूरे एचएफ स्पेस रिपॉज़िटरी को भी डाउनलोड कर सकते हैं, लेकिन यह विकल्प बहुत ही अस्थिर है 😕 + + 2. अब, `आवाज़ बदलें: TTS से R.V.C.` पर जाएं और `सक्रिय` बॉक्स को चेक करें ✅ इसके बाद, आप प्रत्येक TTS बोलने वाले को लागू करने के लिए जो आप चाहते हैं उसे चुन सकते हैं 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. सभी R.V.C. पर लागू किया जाने वाला F0 विधि समायोजित करें 🎛️ + + 4. आपके द्वारा किए गए परिवर्तनों को लागू करने के लिए `आवेदन को लागू करें` दबाएं 🔄 + + 5. वीडियो अनुवाद टैब पर वापस जाएं और 'अनुवाद करें' पर क्लिक करें ▶️ अब, अनुवाद R.V.C. को लागू करते हुए किया जाएगा। 🗣️ + + सुझाव: आप `टेस्ट R.V.C.` का उपयोग करके प्रयोग कर सकते हैं और R.V.C. 
को लागू करने के लिए सर्वोत्तम TTS या कॉन्फ़िगरेशन खोज सकते हैं। 🧪🔍 + + --- + + """, + "tab_translate": "वीडियो अनुवाद", + "video_source": "वीडियो स्रोत चुनें", + "link_label": "मीडिया लिंक।", + "link_info": "उदाहरण: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL यहाँ डालें...", + "dir_label": "वीडियो पथ।", + "dir_info": "उदाहरण: /usr/home/my_video.mp4", + "dir_ph": "पथ यहाँ डालें...", + "sl_label": "स्रोत भाषा", + "sl_info": "यह वीडियो की मूल भाषा है", + "tat_label": "ऑडियो को अनुवाद करें", + "tat_info": "लक्ष्य भाषा का चयन करें और सुनिश्चित करें कि उस भाषा के लिए संबंधित TTS चुना गया है।", + "num_speakers": "वीडियो में कितने लोग बोल रहे हैं, उन्हें चुनें।", + "min_sk": "न्यूनतम बोलने वाले", + "max_sk": "अधिकतम बोलने वाले", + "tts_select": "प्रत्येक बोलने वाले के लिए आप जो आवाज़ चाहते हैं, उसे चुनें।", + "sk1": "TTS बोलने वाला 1", + "sk2": "TTS बोलने वाला 2", + "sk3": "TTS बोलने वाला 3", + "sk4": "TTS बोलने वाला 4", + "sk5": "TTS बोलने वाला 5", + "sk6": "TTS बोलने वाला 6", + "sk7": "TTS बोलने वाला 7", + "sk8": "TTS बोलने वाला 8", + "sk9": "TTS बोलने वाला 9", + "sk10": "TTS बोलने वाला 10", + "sk11": "TTS बोलने वाला 11", + "sk12": "TTS बोलने वाला 12", + "vc_title": "विभिन्न भाषाओं में आवाज़ का नकल", + "vc_subtitle": """ + ### विभिन्न भाषाओं में एक व्यक्ति की आवाज़ का नकल। + जब सही ढंग से प्रयोग किया जाता है, तो अधिकांश आवाज़ों के साथ प्रभावी है, लेकिन हर मामले में पूर्णता को हासिल नहीं कर सकता है। + आवाज़ का नकल केवल संदर्भ वक्ता के टोन को प्रतिलिपि करता है, एक्सेंट और भावना को बाहर करता है, जो आधार वक्ता TTS मॉडल द्वारा नियंत्रित होता है और कनवर्टर द्वारा प्रतिलिपि नहीं किया जाता है। + यह मुख्य ऑडियो के लिए ऑडियो नमूने लेता है और प्रसंस्करण करता है। + """, + "vc_active_label": "सक्रिय आवाज़ का नकल", + "vc_active_info": "सक्रिय आवाज़ का नकल: मूल बोलने वाले के टोन को प्रतिलिपि करता है", + "vc_method_label": "विधि", + "vc_method_info": "आवाज़ का नकल प्रक्रिया के लिए एक विधि का चयन करें", + "vc_segments_label": "अधिकतम सैंपल", + "vc_segments_info": "अधिकतम सैंपल: प्रक्रिया के लिए ऑडियो सैंपलों की संख्या है, अधिक बेहतर है, लेकिन यह शोर जोड़ सकता है", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: ऑडियो सैंपलों पर ध्वनि dereverb लागू करता है।", + "vc_remove_label": "पिछले सैंपल हटाएं", + "vc_remove_info": "पिछले सैंपल हटाएं: पिछले सैंपल हटा देता है: ताकि नए सैंपल उत्पन्न किए जाने की आवश्यकता हो।", + "xtts_title": "ऑडियो पर आधारित TTS बनाएं", + "xtts_subtitle": "एक ऑडियो फ़ाइल को अधिकतम 10 सेकंड के साथ एक आवाज़ के साथ अपलोड करें। XTTS का उपयोग करके, एक नया TTS बनाया जाएगा जो प्रदान की गई ऑडियो फ़ाइल के समान होगा।", + "xtts_file_label": "आवाज़ के साथ एक छोटा ऑडियो अपलोड करें", + "xtts_name_label": "TTS के लिए नाम", + "xtts_name_info": "एक सरल नाम का उपयोग करें", + "xtts_dereverb_label": "Dereverb ऑडियो", + "xtts_dereverb_info": "Dereverb ऑडियो: ऑडियो पर ध्वनि dereverb लागू करें", + "xtts_button": "ऑडियो प्रक्रिया करें और इसे TTS सेलेक्टर में शामिल करें", + "xtts_footer": "स्वचालित रूप से आवाज़ xtts उत्पन्न करें: अनुवाद उत्पन्न करते समय प्रत्येक बोलने वाले के लिए सेगमेंट ऑटोमेटिकली उत्पन्न करने के लिए आप `_XTTS_/AUTOMATIC.wav` का उपयोग कर सकते हैं।", + "extra_setting": "उन्नत सेटिंग्स", + "acc_max_label": "अधिकतम ऑडियो त्वरण", + "acc_max_info": "ओवरलैपिंग से बचने के लिए अनुवादित ऑडियो सेगमेंटों के लिए अधिकतम त्वरण। 1.0 का मान कोई त्वरण नहीं दर्शाता है।", + "acc_rate_label": "त्वरण दर नियामक", + "acc_rate_info": "त्वरण दर नियामक: त्वरण को समायोजित करता है ताकि उपभागों को उससे कम गति की आवश्यकता हो, सततता को बनाए रखते हुए और अगले आरंभ के समय को ध्यान में रखते हुए।", + 
"or_label": "ओवरलैप कमी करना", + "or_info": "ओवरलैप कमी करना: पिछले समाप्ति समयों के आधार पर शुरुआत समयों को समायोजित करके सेगमेंट को ओवरलैप नहीं होने देता है; समवारण को बिगाड़ सकता है।", + "aud_mix_label": "ऑडियो मिश्रण विधि", + "aud_mix_info": "मूल और अनुवादित ऑडियो फ़ाइलों को मिश्रित करें और दो उपलब्ध मिश्रण मोड के साथ एक अनुकूलित, संतुलित उत्पादन बनाएं।", + "vol_ori": "मूल ऑडियो ध्वनि", + "vol_tra": "अनुवादित ऑडियो ध्वनि", + "voiceless_tk_label": "वॉइसलेस ट्रैक", + "voiceless_tk_info": "अनुवादित ऑडियो के साथ इसे मिलाने से पहले मूल ऑडियो ध्वनियों को हटाएं।", + "sub_type": "उपशीर्षक प्रकार", + "soft_subs_label": "मुलायम सबटाइटल्स", + "soft_subs_info": "मुलायम सबटाइटल्स: व्यूअर्स वीडियो देखते समय चाहें तो चालू या बंद कर सकते हैं।", + "burn_subs_label": "उपशीर्षक जलाएं", + "burn_subs_info": "उपशीर्षक जलाएं: वीडियो में उपशीर्षक एम्बेड करें, जिससे वे दृश्यीय सामग्री का स्थायी हिस्सा बन जाएं।", + "whisper_title": "कॉन्फ़िगर ट्रांस्क्रिप्शन।", + "lnum_label": "संख्याओं का वाचक रूपांतरण", + "lnum_info": "संख्याओं का वाचक रूपांतरण: संख्यात्मक प्रतिनिधित्वों को उनके लेखित समकक्षों से प्रतिस्थापित करें ट्रांसक्रिप्ट में।", + "scle_label": "ध्वनि की सफाई", + "scle_info": "ध्वनि की सफाई: अधिकतम समयचिह्न सटीकता के लिए ध्वनि को बेहतर बनाएं, समय चिह्नों की अधिकता के लिए अधिकतम समयचिह्न सटीकता के लिए पीछे की ध्वनि हटाएं। इस ऑपरेशन में समय लग सकता है, खासकर लंबे ऑडियो फ़ाइलों के साथ।", + "sd_limit_label": "सेगमेंट अवधि सीमा", + "sd_limit_info": "प्रत्येक सेगमेंट की अधिकतम अवधि (सेकंड में) को निर्दिष्ट करें। ऑडियो को वैड का उपयोग करके प्रोसेस किया जाएगा, प्रत्येक सेगमेंट चंक की अवधि को सीमित करके।", + "asr_model_info": "यह डिफ़ॉल्ट रूप से बोली भाषा को पाठ में परिवर्तित करता है 'व्हिस्पर मॉडल' का उपयोग करके। अपना कस्टम मॉडल उपयोग करें, उदाहरण के लिए, ड्रॉपडाउन में रिपॉज़िटरी नाम 'BELLE-2/Belle-whisper-large-v3-zh' दर्ज करके एक चीनी भाषा फ़ाइन ट्यून मॉडल का उपयोग करें। Hugging Face पर फ़ाइन ट्यून मॉडल्स पाएँ।", + "ctype_label": "हिसाब प्रकार", + "ctype_info": "छोटे प्रकारों जैसे int8 या फ़्लोट16 का चयन करना प्रदर्शन को बढ़ावा दे सकता है, मेमोरी उपयोग को कम करके और गणनात्मक परिचालन बढ़ाकर प्रदर्शन को सुधार सकता है, लेकिन float32 जैसे बड़े डेटा प्रकारों की तुलना में निश्चितता को कट्टरता में बदल सकता है।", + "batchz_label": "बैच का आकार", + "batchz_info": "यदि आपके पास कम VRAM वाली जीपीयू है, तो बैच का आकार कम करने से मेमोरी बचाई जा सकती है और मेमोरी की कमी की समस्याओं का प्रबंधन किया जा सकता है।", + "tsscale_label": "पाठ के विभाजन का पैमाना", + "tsscale_info": "पाठ को वाक्य, शब्द या अक्षरों के आधार पर खंडों में विभाजित करें। शब्द और अक्षर विभाजन और लघु ग्रेन्युलरिटी प्रदान करता है, जो उपशीर्षकों के लिए उपयोगी है; अनुवाद को अक्षम करने से मूल संरचना को संरक्षित रखा जाता है।", + "srt_file_label": "एक SRT उपशीर्षक फ़ाइल अपलोड करें (विस्पर की प्रतिलिपि के बजाय इस्तेमाल की जाएगी)", + "divide_text_label": "पुनः विभाजित करें टेक्स्ट सेगमेंट द्वारा:", + "divide_text_info": "(प्रयोगात्मक) मौजूदा पाठ सेगमेंट को विभाजित करने के लिए एक विभाजक दर्ज करें। उपकरण को घटनाओं को पहचानने और उन्हें अनुसार नए सेगमेंट बनाने के लिए। | का उपयोग करके एक से अधिक विभाजक निर्दिष्ट करें, उदा।: !|?|...|。", + "diarization_label": "डायरिज़ेशन मॉडल", + "tr_process_label": "अनुवाद प्रक्रिया", + "out_type_label": "आउटपुट प्रकार", + "out_name_label": "फ़ाइल का नाम", + "out_name_info": "आउटपुट फ़ाइल का नाम", + "task_sound_label": "कार्य स्थिति ध्वनि", + "task_sound_info": "कार्य स्थिति ध्वनि: कार्य समाप्ति या क्रिया के दौरान त्रुटियों की सूचना देने वाला ध्वनि चलाता है।", + "cache_label": "प्रगति पुनः प्राप्त करें", + "cache_info": "प्रगति पुनः 
प्राप्त करें: पिछले चेकप्वाइंट से प्रक्रिया जारी रखें।", + "preview_info": "पूर्णत: अधिकतम 10 सेकंड के लिए वीडियो काटता है परीक्षण के उद्देश्यों के लिए। कृपया इसे निष्क्रिय करें ताकि पूरा वीडियो अवधि प्राप्त की जा सके।", + "edit_sub_label": "उत्पन्न उपशीर्षक संपादित करें", + "edit_sub_info": "उत्पन्न उपशीर्षक संपादित करें: आपको 2 चरणों में अनुवाद चलाने की अनुमति देता है। पहले 'GET SUBTITLES AND EDIT' बटन के साथ, आप उन्हें संपादित करने के लिए उपशीर्षक प्राप्त करते हैं, और फिर 'TRANSLATE' बटन के साथ, आप वीडियो उत्पन्न कर सकते हैं", + "button_subs": "GET SUBTITLES AND EDIT", + "editor_sub_label": "उत्पन्न उपशीर्षक", + "editor_sub_info": "यहाँ उत्पन्न उपशीर्षक में पाठ संपादित करने के लिए स्वतंत्र महसूस करें। आप इंटरफ़ेस विकल्पों में परिवर्तन कर सकते हैं, 'TRANSLATE' बटन पर क्लिक करने से पहले, 'Source language', 'Translate audio to' और 'Max speakers', त्रुटियों से बचने के लिए, 'TRANSLATE' बटन पर क्लिक करें। जब आप समाप्त हो जाएं, 'TRANSLATE' बटन पर क्लिक करें।", + "editor_sub_ph": "पहले 'GET SUBTITLES AND EDIT' दबाएं ताकि उपशीर्षक प्राप्त हो", + "button_translate": "TRANSLATE", + "output_result_label": "अनुवादित वीडियो डाउनलोड करें", + "sub_ori": "उपशीर्षक", + "sub_tra": "अनुवादित उपशीर्षक", + "ht_token_info": "एक महत्वपूर्ण कदम है प्यानोट का उपयोग करने के लिए लाइसेंस समझ। आपको Hugging Face पर एक खाता होना चाहिए और मॉडल का उपयोग करने के लिए लाइसेंस स्वीकार करने की आवश्यकता है: https://huggingface.co/pyannote/speaker-diarization और https://huggingface.co/pyannote/segmentation। अपना KEY TOKEN यहाँ प्राप्त करें: https://hf.co/settings/tokens", + "ht_token_ph": "टोकन यहाँ जाता है...", + "tab_docs": "दस्तावेज़ अनुवाद", + "docs_input_label": "दस्तावेज़ स्रोत चुनें", + "docs_input_info": "यह PDF, DOCX, TXT, या पाठ हो सकता है", + "docs_source_info": "यह पाठ की मूल भाषा है", + "chunk_size_label": "प्रति सेगमेंट TTS द्वारा प्रसंस्कृत किए जाने वाले अधिकतम अक्षरों की संख्या", + "chunk_size_info": "0 का मान एक गतिशील और और संगतिपूर्ण मान को TTS के लिए सौंपता है।", + "docs_button": "भाषा परिवर्तन सेतु शुरू करें", + "cv_url_info": "URL से R.V.C. मॉडल आपमूर्त डाउनलोड करें। आप HuggingFace या Drive से लिंक का उपयोग कर सकते हैं, और आप कई लिंक शामिल कर सकते हैं, प्रत्येक को अल्पविराम द्वारा अलग किया जा सकता है। उदाहरण: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "आवाज़ को बदलें: TTS से R.V.C.", + "sec1_title": "### 1. इसका उपयोग सक्षम करने के लिए, इसे सक्षम करें के रूप में चिह्नित करें।", + "enable_replace": "मॉडल का उपयोग सक्षम करने के लिए इसे चेक करें।", + "sec2_title": "### 2. 
प्रत्येक संबंधित बोलने वाले के प्रत्येक TTS को लागू करने के लिए एक आवाज़ का चयन करें और विन्यास लागू करें।", + "sec2_subtitle": "आपके पास कितने हैं, इस पर निर्भर करता है, प्रत्येक को उसका संबंधित मॉडल चाहिए। विशेषज्ञ नहीं पाया गया है।", + "cv_tts1": "बोलने वाले 1 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts2": "बोलने वाले 2 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts3": "बोलने वाले 3 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts4": "बोलने वाले 4 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts5": "बोलने वाले 5 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts6": "बोलने वाले 6 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts7": "बोलने वाले 7 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts8": "बोलने वाले 8 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts9": "बोलने वाले 9 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts10": "बोलने वाले 10 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts11": "बोलने वाले 11 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_tts12": "बोलने वाले 12 के लिए लागू करने के लिए आवाज़ चुनें।", + "cv_aux": "- यदि किसी कारणवश स्पीकर सही ढंग से पहचाना नहीं गया है, तो लागू करने के लिए आवाज़।", + "cv_button_apply": "आवेदन को लागू करें", + "tab_help": "सहायता", + }, + "vietnamese": { + "description": """ + ### 🎥 **Dịch video dễ dàng với SoniTranslate!** 📽️ + + Tải lên một video, tập tin âm thanh hoặc cung cấp một liên kết YouTube. 📽️ **Nhận sổ tay cập nhật từ kho chính thức: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Xem tab `Trợ giúp` để biết hướng dẫn cách sử dụng. Hãy bắt đầu vui vẻ với việc dịch video! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Hướng dẫn sử dụng:** + + 1. 📤 Tải lên một **video**, **tập tin âm thanh** hoặc cung cấp một 🌐 **liên kết YouTube**. + + 2. 🌍 Chọn ngôn ngữ bạn muốn **dịch video** sang. + + 3. 🗣️ Chỉ định **số người nói** trong video và **gán mỗi người một giọng nói chuyển văn bản** phù hợp cho ngôn ngữ dịch. + + 4. 🚀 Nhấn nút '**Dịch**' để nhận kết quả. + + --- + + # 🧩 **SoniTranslate hỗ trợ các công cụ TTS (Text-to-Speech) khác nhau, bao gồm:** + - EDGE-TTS → định dạng `en-AU-WilliamNeural-Male` → Nhanh và chính xác. + - FACEBOOK MMS → định dạng `en-facebook-mms VITS` → Giọng nói tự nhiên hơn; hiện tại chỉ sử dụng CPU. + - PIPER TTS → định dạng `en_US-lessac-high VITS-onnx` → Giống như cái trước, nhưng được tối ưu hóa cho cả CPU và GPU. + - BARK → định dạng `en_speaker_0-Male BARK` → Chất lượng tốt nhưng chậm, và dễ bị ảo giác. + - OpenAI TTS → định dạng `>alloy OpenAI-TTS` → Đa ngôn ngữ nhưng cần một OpenAI API key + - Coqui XTTS → định dạng `_XTTS_/AUTOMATIC.wav` → Chỉ có sẵn cho tiếng Trung (Giản thể), tiếng Anh, tiếng Pháp, tiếng Đức, tiếng Ý, tiếng Bồ Đào Nha, tiếng Ba Lan, tiếng Thổ Nhĩ Kỳ, tiếng Nga, tiếng Hà Lan, tiếng Séc, tiếng Ả Rập, tiếng Tây Ban Nha, tiếng Hungary, tiếng Hàn và tiếng Nhật. + + --- + + # 🎤 Cách Sử Dụng Giọng R.V.C. và R.V.C.2 (Tùy chọn) 🎶 + + Mục tiêu là áp dụng một R.V.C. vào TTS (Text-to-Speech) được tạo ra 🎙️ + + 1. Trong tab `Giọng Tùy chỉnh R.V.C.`, tải xuống các mô hình bạn cần 📥 Bạn có thể sử dụng các liên kết từ Hugging Face và Google Drive ở các định dạng như zip, pth, hoặc index. Bạn cũng có thể tải xuống các kho HF hoàn chỉnh, nhưng tùy chọn này không ổn định lắm 😕 + + 2. Bây giờ, đi đến `Thay thế giọng: TTS thành R.V.C.` và đánh dấu vào hộp `enable` ✅ Sau đó, bạn có thể chọn các mô hình bạn muốn áp dụng cho mỗi người nói TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Điều chỉnh phương pháp F0 sẽ được áp dụng cho tất cả R.V.C. 🎛️ + + 4. Nhấn `APPLY CONFIGURATION` để áp dụng các thay đổi bạn đã thực hiện 🔄 + + 5. 
Quay lại tab dịch video và nhấp vào 'Dịch' ▶️ Bây giờ, dịch sẽ được thực hiện áp dụng R.V.C. 🗣️ + + Mẹo: Bạn có thể sử dụng `Kiểm tra R.V.C.` để thử nghiệm và tìm ra các TTS hoặc cấu hình tốt nhất để áp dụng vào R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Dịch video", + "video_source": "Chọn Nguồn Video", + "link_label": "Liên kết truyền thông.", + "link_info": "Ví dụ: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "Đường dẫn URL vào đây...", + "dir_label": "Đường dẫn Video.", + "dir_info": "Ví dụ: /usr/home/my_video.mp4", + "dir_ph": "Đường dẫn vào đây...", + "sl_label": "Ngôn ngữ nguồn", + "sl_info": "Đây là ngôn ngữ gốc của video", + "tat_label": "Dịch âm thanh thành", + "tat_info": "Chọn ngôn ngữ đích và đồng thời đảm bảo chọn TTS tương ứng cho ngôn ngữ đó.", + "num_speakers": "Chọn số người nói trong video.", + "min_sk": "Ít người nói", + "max_sk": "Nhiều người nói", + "tts_select": "Chọn giọng bạn muốn cho mỗi người nói.", + "sk1": "Người Nói TTS 1", + "sk2": "Người Nói TTS 2", + "sk3": "Người Nói TTS 3", + "sk4": "Người Nói TTS 4", + "sk5": "Người Nói TTS 5", + "sk6": "Người Nói TTS 6", + "sk7": "Người Nói TTS 7", + "sk8": "Người Nói TTS 8", + "sk9": "Người Nói TTS 9", + "sk10": "Người Nói TTS 10", + "sk11": "Người Nói TTS 11", + "sk12": "Người Nói TTS 12", + "vc_title": "Sao chép giọng nói trong các ngôn ngữ khác nhau", + "vc_subtitle": """ + ### Sao chép giọng nói của một người qua các ngôn ngữ khác nhau. + Mặc dù hiệu quả với hầu hết các giọng nói khi sử dụng một cách phù hợp, nhưng không phải lúc nào cũng đạt được sự hoàn hảo trong mọi trường hợp. + Sao chép giọng nói chỉ sao chép âm sắc của người tham chiếu, loại bỏ giọng địa phương và cảm xúc, được quản lý bởi mô hình TTS cơ bản và không được sao chép bởi bộ chuyển đổi. + Điều này sẽ lấy mẫu âm thanh từ âm thanh chính cho mỗi người nói và xử lý chúng. + """, + "vc_active_label": "Kích hoạt Sao chép Giọng nói", + "vc_active_info": "Kích hoạt Sao chép Giọng nói: Sao chép âm sắc của người nói gốc", + "vc_method_label": "Phương pháp", + "vc_method_info": "Chọn một phương pháp cho quá trình Sao chép Giọng nói", + "vc_segments_label": "Mẫu tối đa", + "vc_segments_info": "Mẫu tối đa: Là số lượng mẫu âm thanh sẽ được tạo ra cho quá trình, càng nhiều càng tốt nhưng có thể thêm tiếng ồn", + "vc_dereverb_label": "Loại bỏ tiếng vang", + "vc_dereverb_info": "Loại bỏ tiếng vang: Áp dụng loại bỏ tiếng vang vào các mẫu âm thanh.", + "vc_remove_label": "Loại bỏ các mẫu trước", + "vc_remove_info": "Loại bỏ các mẫu trước: Loại bỏ các mẫu đã tạo trước đó, vì vậy cần tạo mới.", + "xtts_title": "Tạo TTS dựa trên một âm thanh", + "xtts_subtitle": "Tải lên một tập tin âm thanh tối đa 10 giây với một giọng nói. Sử dụng XTTS, một TTS mới sẽ được tạo ra với một giọng nói tương tự như tập tin âm thanh được cung cấp.", + "xtts_file_label": "Tải lên một âm thanh ngắn với giọng nói", + "xtts_name_label": "Tên cho TTS", + "xtts_name_info": "Sử dụng một tên đơn giản", + "xtts_dereverb_label": "Loại bỏ tiếng vang âm thanh", + "xtts_dereverb_info": "Loại bỏ tiếng vang âm thanh: Áp dụng loại bỏ tiếng vang âm thanh", + "xtts_button": "Xử lý âm thanh và bao gồm nó trong trình chọn TTS", + "xtts_footer": "Tạo TTS giọng nói tự động: Bạn có thể sử dụng `_XTTS_/AUTOMATIC.wav` trong trình chọn TTS để tự động tạo các đoạn cho mỗi người nói khi tạo dịch.", + "extra_setting": "Cài Đặt Nâng Cao", + "acc_max_label": "Tăng tốc âm thanh tối đa", + "acc_max_info": "Tăng tốc tối đa cho các đoạn âm thanh dịch để tránh chồng chéo. 
Giá trị 1.0 đại diện cho không tăng tốc", + "acc_rate_label": "Điều Chỉnh Tốc Độ Tăng Tốc", + "acc_rate_info": "Điều Chỉnh Tốc Độ Tăng Tốc: Điều chỉnh tốc độ tăng tốc để phù hợp với các đoạn yêu cầu tốc độ thấp hơn, duy trì liên tục và xem xét thời gian bắt đầu tiếp theo.", + "or_label": "Giảm chồng chéo", + "or_info": "Giảm chồng chéo: Đảm bảo các đoạn không chồng chéo bằng cách điều chỉnh thời gian bắt đầu dựa trên thời gian kết thúc trước đó; có thể làm gián đoạn đồng bộ hóa.", + "aud_mix_label": "Phương pháp Trộn Âm thanh", + "aud_mix_info": "Trộn các tập tin âm thanh gốc và dịch để tạo ra một đầu ra cân bằng tùy chỉnh với hai chế độ trộn có sẵn.", + "vol_ori": "Âm lượng âm thanh gốc", + "vol_tra": "Âm lượng âm thanh dịch", + "voiceless_tk_label": "Dạng Dữ liệu Không Có Giọng Nói", + "voiceless_tk_info": "Dạng Dữ liệu Không Có Giọng Nói: Loại bỏ các giọng nói âm thanh gốc trước khi kết hợp nó với âm thanh dịch.", + "sub_type": "Loại Phụ Đề", + "soft_subs_label": "Phụ Đề Mềm", + "soft_subs_info": "Phụ Đề Mềm: Phụ đề tùy chọn mà người xem có thể bật hoặc tắt trong khi xem video.", + "burn_subs_label": "Đốt Phụ đề", + "burn_subs_info": "Đốt Phụ đề: Nhúng phụ đề vào video, biến chúng thành một phần cố định của nội dung hình ảnh.", + "whisper_title": "Cấu hình chuyển đổi.", + "lnum_label": "Biểu Diễn Số Bằng Chữ", + "lnum_info": "Biểu Diễn Số Bằng Chữ: Thay thế các biểu diễn số thành các tương đương viết của chúng trong bản ghi âm.", + "scle_label": "Dọn Dẹp Âm Thanh", + "scle_info": "Dọn Dẹp Âm Thanh: Nâng cao giọng nói, loại bỏ tiếng ồn nền trước khi chuyển đổi để đạt được độ chính xác cao nhất về dấu thời gian. Thao tác này có thể mất thời gian, đặc biệt là với các tệp âm thanh dài.", + "sd_limit_label": "Giới Hạn Thời Lượng Đoạn", + "sd_limit_info": "Chỉ định thời lượng tối đa (theo giây) cho mỗi đoạn. Âm thanh sẽ được xử lý bằng cách sử dụng VAD, giới hạn thời lượng cho mỗi đoạn.", + "asr_model_info": "Nó chuyển đổi ngôn ngữ nói thành văn bản bằng cách sử dụng mô hình 'Whisper' theo mặc định. Sử dụng một mô hình tùy chỉnh, ví dụ, bằng cách nhập tên kho 'BELLE-2/Belle-whisper-large-v3-zh' trong danh sách thả xuống để sử dụng một mô hình đã được điều chỉnh cho ngôn ngữ Trung Quốc. Tìm mô hình đã điều chỉnh trên Hugging Face.", + "ctype_label": "Loại Tính Toán", + "ctype_info": "Lựa chọn các loại nhỏ hơn như int8 hoặc float16 có thể cải thiện hiệu suất bằng cách giảm việc sử dụng bộ nhớ và tăng thông lượng tính toán, nhưng có thể hy sinh độ chính xác so với các loại dữ liệu lớn hơn như float32.", + "batchz_label": "Kích Thước Lô", + "batchz_info": "Giảm kích thước lô giúp tiết kiệm bộ nhớ nếu GPU của bạn có ít VRAM và giúp quản lý các vấn đề Cạn Kiệt Bộ Nhớ.", + "tsscale_label": "Thước Đo Phân Đoạn Văn Bản", + "tsscale_info": "Chia văn bản thành các đoạn theo câu, từ hoặc ký tự. Phân đoạn theo từng từ và ký tự cung cấp độ mịn hơn, hữu ích cho phụ đề; vô hiệu hóa dịch thuật bảo tồn cấu trúc gốc.", + "srt_file_label": "Tải lên một tập tin phụ đề SRT (sẽ được sử dụng thay vì việc chuyển đổi của Whisper)", + "divide_text_label": "Chia lại đoạn văn bản bằng:", + "divide_text_info": "(Thử nghiệm) Nhập một bộ phân cách để chia các đoạn văn bản hiện có trong ngôn ngữ nguồn. Công cụ sẽ nhận dạng các xuất hiện và tạo ra các đoạn mới tương ứng. 
Chỉ định nhiều bộ phân cách bằng |, ví dụ: !|?|...|。", + "diarization_label": "Mô hình Phân tích", + "tr_process_label": "Quy trình Dịch", + "out_type_label": "Loại Đầu ra", + "out_name_label": "Tên tập tin", + "out_name_info": "Tên của tập tin đầu ra", + "task_sound_label": "Âm thanh Trạng thái Nhiệm vụ", + "task_sound_info": "Âm thanh Trạng thái Nhiệm vụ: Phát ra một cảnh báo âm thanh cho biết nhiệm vụ đã hoàn thành hoặc có lỗi trong quá trình thực thi.", + "cache_label": "Lấy Tiến Trình", + "cache_info": "Lấy Tiến Trình: Tiếp tục quá trình từ điểm kiểm soát cuối cùng.", + "preview_info": "Xem trước cắt video chỉ 10 giây cho mục đích kiểm tra. Vui lòng tắt nó để lấy lại độ dài video đầy đủ.", + "edit_sub_label": "Chỉnh sửa phụ đề đã tạo", + "edit_sub_info": "Chỉnh sửa phụ đề đã tạo: Cho phép bạn chạy dịch trong 2 bước. Đầu tiên với nút 'NHẬN PHỤ ĐỀ VÀ CHỈNH SỬA', bạn nhận được phụ đề để chỉnh sửa chúng, và sau đó với nút 'DỊCH', bạn có thể tạo ra video", + "button_subs": "NHẬN PHỤ ĐỀ VÀ CHỈNH SỬA", + "editor_sub_label": "Phụ đề đã tạo", + "editor_sub_info": "Hãy tự do chỉnh sửa văn bản trong phụ đề đã tạo ở đây. Bạn có thể thay đổi các tùy chọn giao diện trước khi nhấn nút 'DỊCH', ngoại trừ 'Ngôn ngữ nguồn', 'Dịch âm thanh thành', và 'Số người nói tối đa', để tránh lỗi. Khi bạn hoàn thành, nhấn nút 'DỊCH'.", + "editor_sub_ph": "Đầu tiên nhấn 'NHẬN PHỤ ĐỀ VÀ CHỈNH SỬA' để nhận phụ đề", + "button_translate": "DỊCH", + "output_result_label": "TẢI VỀ VIDEO DỊCH", + "sub_ori": "Phụ đề", + "sub_tra": "Phụ đề dịch", + "ht_token_info": "Một bước quan trọng là chấp nhận thỏa thuận giấy phép để sử dụng Pyannote. Bạn cần có một tài khoản trên Hugging Face và chấp nhận thỏa thuận giấy phép để sử dụng các mô hình: https://huggingface.co/pyannote/speaker-diarization và https://huggingface.co/pyannote/segmentation. Lấy KEY TOKEN của bạn tại đây: https://hf.co/settings/tokens", + "ht_token_ph": "Token vào đây...", + "tab_docs": "Dịch tài liệu", + "docs_input_label": "Chọn Nguồn Tài Liệu", + "docs_input_info": "Có thể là PDF, DOCX, TXT, hoặc văn bản", + "docs_source_info": "Đây là ngôn ngữ gốc của văn bản", + "chunk_size_label": "Số ký tự tối đa mà TTS sẽ xử lý cho mỗi đoạn", + "chunk_size_info": "Giá trị 0 gán một giá trị động và tương thích hơn cho TTS.", + "docs_button": "Bắt đầu Cầu Nối Chuyển Đổi Ngôn Ngữ", + "cv_url_info": "Tự động tải xuống các mô hình R.V.C. từ URL. Bạn có thể sử dụng các liên kết từ HuggingFace hoặc Drive, và bạn có thể bao gồm nhiều liên kết, mỗi liên kết cách nhau bằng dấu phẩy. Ví dụ: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Thay thế giọng: TTS thành R.V.C.", + "sec1_title": "### 1. Để kích hoạt việc sử dụng, đánh dấu nó như là kích hoạt.", + "enable_replace": "Kiểm tra điều này để kích hoạt việc sử dụng các mô hình.", + "sec2_title": "### 2. Chọn một giọng nói sẽ được áp dụng cho mỗi TTS của mỗi người nói tương ứng và áp dụng các cấu hình.", + "sec2_subtitle": "Tùy thuộc vào số lượng bạn sẽ sử dụng, mỗi người cần một mô hình tương ứng của mình. 
Ngoài ra, còn có một mô hình phụ trợ nếu vì một lý do nào đó không nhận diện được người nói đúng cách.", + "cv_tts1": "Chọn giọng nói áp dụng cho Người Nói 1.", + "cv_tts2": "Chọn giọng nói áp dụng cho Người Nói 2.", + "cv_tts3": "Chọn giọng nói áp dụng cho Người Nói 3.", + "cv_tts4": "Chọn giọng nói áp dụng cho Người Nói 4.", + "cv_tts5": "Chọn giọng nói áp dụng cho Người Nói 5.", + "cv_tts6": "Chọn giọng nói áp dụng cho Người Nói 6.", + "cv_tts7": "Chọn giọng nói áp dụng cho Người Nói 7.", + "cv_tts8": "Chọn giọng nói áp dụng cho Người Nói 8.", + "cv_tts9": "Chọn giọng nói áp dụng cho Người Nói 9.", + "cv_tts10": "Chọn giọng nói áp dụng cho Người Nói 10.", + "cv_tts11": "Chọn giọng nói áp dụng cho Người Nói 11.", + "cv_tts12": "Chọn giọng nói áp dụng cho Người Nói 12.", + "cv_aux": "- Giọng nói được áp dụng trong trường hợp không nhận diện được người nói thành công.", + "cv_button_apply": "ÁP DỤNG CẤU HÌNH", + "tab_help": "Trợ giúp", + }, + "polish": { + "description": """ + ### 🎥 **Łatwe tłumaczenie filmów dzięki SoniTranslate!** 📽️ + + Prześlij film, plik dźwiękowy lub podaj link do YouTube. 📽️ **Pobierz aktualny notatnik ze strony oficjalnego repozytorium: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Zobacz zakładkę `Pomoc` w celu uzyskania instrukcji dotyczących korzystania z aplikacji. Zaczynajmy zabawę z tłumaczeniem filmów! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instrukcje dotyczące użytkowania:** + + 1. 📤 Prześlij **film**, **plik dźwiękowy** lub podaj 🌐 **link do YouTube**. + + 2. 🌍 Wybierz język, na który chcesz **przetłumaczyć film**. + + 3. 🗣️ Określ **liczbę osób mówiących** w filmie i **przypisz każdej z nich odpowiednią syntezę mowy tekstowej (TTS)** odpowiednią dla języka tłumaczenia. + + 4. 🚀 Naciśnij przycisk '**Tłumacz**', aby uzyskać wyniki. + + --- + + # 🧩 **SoniTranslate obsługuje różne silniki TTS (tekst do mowy), które to:** + - EDGE-TTS → format `en-AU-WilliamNeural-Male` → Szybki i dokładny. + - FACEBOOK MMS → format `en-facebook-mms VITS` → Głos jest bardziej naturalny; obecnie wykorzystuje tylko CPU. + - PIPER TTS → format `en_US-lessac-high VITS-onnx` → To samo co poprzednie, ale zoptymalizowane zarówno pod CPU, jak i GPU. + - BARK → format `en_speaker_0-Male BARK` → Dobra jakość, ale wolne działanie, podatne na halucynacje. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Wielojęzyczne, ale wymaga klucza OpenAI API + - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Dostępne tylko dla języka chińskiego (uproszczonego), angielskiego, francuskiego, niemieckiego, włoskiego, portugalskiego, polskiego, tureckiego, rosyjskiego, niderlandzkiego, czeskiego, arabskiego, hiszpańskiego, węgierskiego, koreańskiego i japońskiego. + + --- + + # 🎤 Jak używać głosów R.V.C. i R.V.C.2 (opcjonalnie) 🎶 + + Celem jest zastosowanie R.V.C. do wygenerowanego TTS (tekst do mowy) 🎙️ + + 1. W zakładce `Custom Voice R.V.C.` pobierz potrzebne modele 📥 Możesz użyć linków z Hugging Face i Google Drive w formatach takich jak zip, pth lub index. Możesz również pobrać pełne repozytoria HF Space, ale ta opcja nie jest bardzo stabilna 😕 + + 2. Teraz przejdź do `Zamień głos: TTS na R.V.C.` i zaznacz pole `włącz` ✅ Następnie możesz wybrać modele, które chcesz zastosować do każdego mówcy TTS 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Dostosuj metodę F0, która zostanie zastosowana do wszystkich R.V.C. 🎛️ + + 4. Naciśnij przycisk `ZASTOSUJ KONFIGURACJĘ`, aby zastosować wprowadzone zmiany 🔄 + + 5. Wróć do zakładki tłumaczenia filmu i kliknij 'Tłumacz' ▶️ Teraz tłumaczenie zostanie wykonane, zastosowując R.V.C. 
🗣️ + + Wskazówka: Możesz użyć `Test R.V.C.` do eksperymentowania i znalezienia najlepszego TTS lub konfiguracji do zastosowania w R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Tłumaczenie filmu", + "video_source": "Wybierz Źródło Wideo", + "link_label": "Link do multimediów.", + "link_info": "Przykład: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "Wklej tutaj URL...", + "dir_label": "Ścieżka do Wideo.", + "dir_info": "Przykład: /usr/home/my_video.mp4", + "dir_ph": "Wklej tutaj ścieżkę...", + "sl_label": "Język źródłowy", + "sl_info": "To jest oryginalny język filmu", + "tat_label": "Przetłumacz audio na", + "tat_info": "Wybierz język docelowy i upewnij się, że wybierzesz odpowiednią syntezę mowy tekstowej (TTS) dla tego języka.", + "num_speakers": "Wybierz, ile osób mówi w filmie.", + "min_sk": "Min. mówców", + "max_sk": "Maks. mówców", + "tts_select": "Wybierz głos dla każdego mówcy.", + "sk1": "Głos TTS Mówca 1", + "sk2": "Głos TTS Mówca 2", + "sk3": "Głos TTS Mówca 3", + "sk4": "Głos TTS Mówca 4", + "sk5": "Głos TTS Mówca 5", + "sk6": "Głos TTS Mówca 6", + "sk7": "Głos TTS Mówca 7", + "sk8": "Głos TTS Mówca 8", + "sk9": "Głos TTS Mówca 9", + "sk10": "Głos TTS Mówca 10", + "sk11": "Głos TTS Mówca 11", + "sk12": "Głos TTS Mówca 12", + "vc_title": "Imitacja głosu w różnych językach", + "vc_subtitle": """ + ### Odtwórz głos osoby w różnych językach. + Mimo że jest skuteczny w większości przypadków, nie zawsze osiąga doskonałość. + Imitacja głosu odtwarza tylko ton osoby referencyjnej, wykluczając akcent i emocje, które są kontrolowane przez model TTS podstawowego mówcy i nie są replikowane przez konwerter. + Będzie pobierać próbki dźwiękowe z głównego dźwięku dla każdego mówcy i je przetwarzać. + """, + "vc_active_label": "Aktywna Imitacja Głosu", + "vc_active_info": "Aktywna Imitacja Głosu: Odtwarza ton oryginalnego mówcy", + "vc_method_label": "Metoda", + "vc_method_info": "Wybierz metodę procesu imitacji głosu", + "vc_segments_label": "Maks. liczba próbek", + "vc_segments_info": "Maks. liczba próbek: To jest liczba próbek dźwiękowych, które zostaną wygenerowane w procesie, więcej to lepiej, ale może to wprowadzić hałas", + "vc_dereverb_label": "Usuń pogłos", + "vc_dereverb_info": "Usuń pogłos: Zastosuj usuwanie pogłosu do próbek dźwiękowych.", + "vc_remove_label": "Usuń poprzednie próbki", + "vc_remove_info": "Usuń poprzednie próbki: Usuń wcześniej wygenerowane próbki, więc trzeba będzie wygenerować nowe.", + "xtts_title": "Utwórz TTS na podstawie dźwięku", + "xtts_subtitle": "Prześlij krótki plik dźwiękowy o maksymalnej długości 10 sekund z głosem. Korzystając z XTTS, zostanie utworzony nowy TTS z głosem podobnym do dostarczonego pliku dźwiękowego.", + "xtts_file_label": "Prześlij krótki dźwięk z głosem", + "xtts_name_label": "Nazwa dla TTS", + "xtts_name_info": "Użyj prostej nazwy", + "xtts_dereverb_label": "Usuń pogłos dźwięku", + "xtts_dereverb_info": "Usuń pogłos dźwięku: Zastosuj usuwanie pogłosu do dźwięku", + "xtts_button": "Przetwórz dźwięk i dodaj go do selektora TTS", + "xtts_footer": "Automatycznie generuj głos XTTS: Możesz użyć `_XTTS_/AUTOMATIC.wav` w selektorze TTS, aby automatycznie generować segmenty dla każdego mówcy podczas generowania tłumaczenia.", + "extra_setting": "Ustawienia Zaawansowane", + "acc_max_label": "Maks. przyspieszenie dźwięku", + "acc_max_info": "Maksymalne przyspieszenie dla przetłumaczonych segmentów dźwiękowych, aby uniknąć nakładania się. 
Wartość 1.0 oznacza brak przyspieszenia", + "acc_rate_label": "Regulacja prędkości przyśpieszania", + "acc_rate_info": "Regulacja prędkości przyśpieszania: Dostosowuje przyśpieszenie, aby dostosować się do segmentów wymagających mniejszej prędkości, zachowując ciągłość i uwzględniając czas następnego startu.", + "or_label": "Redukcja Nakładania", + "or_info": "Redukcja Nakładania: Zapewnia, że segmenty się nie nakładają, poprzez dostosowanie czasów rozpoczęcia na podstawie wcześniejszych czasów zakończenia; może zakłócić synchronizację.", + "aud_mix_label": "Metoda Mieszania Audio", + "aud_mix_info": "Mieszaj pliki audio oryginalne i przetłumaczone, aby utworzyć spersonalizowane, zrównoważone wyjście z dwoma dostępnymi trybami mieszania.", + "vol_ori": "Głośność oryginalnego dźwięku", + "vol_tra": "Głośność przetłumaczonego dźwięku", + "voiceless_tk_label": "Ścieżka bezgłosowa", + "voiceless_tk_info": "Ścieżka bezgłosowa: Usuń głosy oryginalne przed połączeniem ich z przetłumaczonym dźwiękiem.", + "sub_type": "Typ Napisów", + "soft_subs_label": "Miękkie napisy", + "soft_subs_info": "Miękkie napisy: Opcjonalne napisy, które widzowie mogą włączać lub wyłączać podczas oglądania wideo.", + "burn_subs_label": "Wypal napisy", + "burn_subs_info": "Wypal napisy: Osadź napisy w wideo, stając się trwałą częścią treści wizualnej.", + "whisper_title": "Konfiguracja transkrypcji.", + "lnum_label": "Zliteralizuj Liczby", + "lnum_info": "Zliteralizuj Liczby: Zastąp numeryczne reprezentacje ich pisemnymi odpowiednikami w transkrypcji.", + "scle_label": "Oczyszczanie Dźwięku", + "scle_info": "Oczyszczanie Dźwięku: Poprawa głosu, usuwanie szumów tła przed transkrypcją dla najwyższej precyzji znaczników czasowych. Ta operacja może zająć trochę czasu, szczególnie przy długich plikach dźwiękowych.", + "sd_limit_label": "Ograniczenie Czasu Trwania Segmentu", + "sd_limit_info": "Określ maksymalny czas trwania (w sekundach) dla każdego segmentu. Dźwięk będzie przetwarzany za pomocą VAD, ograniczając czas trwania dla każdego fragmentu segmentu.", + "asr_model_info": "Konwertuje mowę na tekst za pomocą modelu „Szept” domyślnie. Użyj niestandardowego modelu, na przykład, wpisując nazwę repozytorium „BELLE-2/Belle-whisper-large-v3-zh” w rozwijanej liście, aby użyć dostosowanego modelu w języku chińskim. Znajdź dostosowane modele na Hugging Face.", + "ctype_label": "Typ Obliczeń", + "ctype_info": "Wybór mniejszych typów, takich jak int8 lub float16, może poprawić wydajność poprzez zmniejszenie użycia pamięci i zwiększenie przepustowości obliczeniowej, ale może poświęcić precyzję w porównaniu do większych typów danych, takich jak float32.", + "batchz_label": "Rozmiar Partii", + "batchz_info": "Zmniejszenie rozmiaru partii oszczędza pamięć, jeśli Twój GPU ma mniej VRAM, i pomaga zarządzać problemami z brakiem pamięci.", + "tsscale_label": "Skala Segmentacji Tekstu", + "tsscale_info": "Podziel tekst na segmenty według zdań, słów lub znaków. Segmentacja według słów i znaków zapewnia drobniejszą granulację, przydatną dla napisów; wyłączenie tłumaczenia zachowuje pierwotną strukturę.", + "srt_file_label": "Prześlij plik napisów SRT (będzie używany zamiast transkrypcji Whisper)", + "divide_text_label": "Podziel segmenty tekstu przez:", + "divide_text_info": "(Eksperymentalne) Wprowadź separator do podziału istniejących segmentów tekstu w języku źródłowym. Narzędzie zidentyfikuje wystąpienia i utworzy nowe segmenty zgodnie z nimi. 
Wprowadź kilka separatorów, używając |, np.: !|?|...|。", + "diarization_label": "Model diarization", + "tr_process_label": "Proces tłumaczenia", + "out_type_label": "Typ wyjścia", + "out_name_label": "Nazwa pliku", + "out_name_info": "Nazwa pliku wyjściowego", + "task_sound_label": "Dźwięk statusu zadania", + "task_sound_info": "Dźwięk statusu zadania: Odtwarza alert dźwiękowy informujący o zakończeniu zadania lub błędach w trakcie wykonywania.", + "cache_label": "Pobierz postęp", + "cache_info": "Pobierz postęp: Kontynuuj proces od ostatniego punktu kontrolnego.", + "preview_info": "Podgląd przycina wideo do 10 sekund tylko do celów testowych. Proszę wyłączyć go, aby pobrać pełną długość wideo.", + "edit_sub_label": "Edytuj wygenerowane napisy", + "edit_sub_info": "Edytuj wygenerowane napisy: Pozwala uruchomić tłumaczenie w 2 krokach. Najpierw za pomocą przycisku 'POBIERZ NAPISY I EDYTUJ' pobierz napisy, aby je edytować, a następnie za pomocą przycisku 'TRANSLATE' możesz wygenerować wideo", + "button_subs": "POBIERZ NAPISY I EDYTUJ", + "editor_sub_label": "Wygenerowane napisy", + "editor_sub_info": "Zapraszamy do edycji tekstu w wygenerowanych napisach tutaj. Możesz wprowadzić zmiany w opcjach interfejsu przed kliknięciem przycisku 'TRANSLATE', oprócz 'Języka źródłowego', 'Przetłumacz audio na' i 'Maks. mówców', aby uniknąć błędów. Po zakończeniu kliknij przycisk 'TRANSLATE'.", + "editor_sub_ph": "Najpierw naciśnij 'POBIERZ NAPISY I EDYTUJ', aby pobrać napisy", + "button_translate": "TRANSLATE", + "output_result_label": "POBIERZ PRZETŁUMACZONE WIDEO", + "sub_ori": "Napisy oryginalne", + "sub_tra": "Przetłumaczone napisy", + "ht_token_info": "Jednym ważnym krokiem jest zaakceptowanie umowy licencyjnej dotyczącej korzystania z Pyannote. Musisz mieć konto na Hugging Face i zaakceptować licencję do użytkowania modeli: https://huggingface.co/pyannote/speaker-diarization oraz https://huggingface.co/pyannote/segmentation. Pobierz swój KLUCZ TOKEN tutaj: https://hf.co/settings/tokens", + "ht_token_ph": "Wklej tutaj Token...", + "tab_docs": "Tłumaczenie dokumentu", + "docs_input_label": "Wybierz Źródło Dokumentu", + "docs_input_info": "To może być plik PDF, DOCX, TXT lub tekst", + "docs_source_info": "To jest oryginalny język tekstu", + "chunk_size_label": "Maks. liczba znaków, które TTS będzie przetwarzał na segment", + "chunk_size_info": "Wartość 0 przypisuje dynamiczną i bardziej kompatybilną wartość dla TTS.", + "docs_button": "Rozpocznij most konwersji językowej", + "cv_url_info": "Automatycznie pobierz modele R.V.C. z adresu URL. Możesz użyć linków z HuggingFace lub Drive, i możesz dołączyć kilka linków, każdy oddzielony przecinkiem. Przykład: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Zamień głos: TTS na R.V.C.", + "sec1_title": "### 1. Aby włączyć jego użycie, zaznacz go jako aktywny.", + "enable_replace": "Zaznacz to, aby włączyć używanie modeli.", + "sec2_title": "### 2. Wybierz głos, który zostanie zastosowany do każdego TTS każdego odpowiedniego mówcy i zastosuj konfiguracje.", + "sec2_subtitle": "W zależności od liczby , którą będziesz używać, każdy potrzebuje odpowiedniego modelu. 
Dodatkowo, jest jeden pomocniczy, jeśli z jakiegoś powodu mówca nie zostanie poprawnie wykryty.", + "cv_tts1": "Wybierz głos, który ma być stosowany dla Mówcy 1.", + "cv_tts2": "Wybierz głos, który ma być stosowany dla Mówcy 2.", + "cv_tts3": "Wybierz głos, który ma być stosowany dla Mówcy 3.", + "cv_tts4": "Wybierz głos, który ma być stosowany dla Mówcy 4.", + "cv_tts5": "Wybierz głos, który ma być stosowany dla Mówcy 5.", + "cv_tts6": "Wybierz głos, który ma być stosowany dla Mówcy 6.", + "cv_tts7": "Wybierz głos, który ma być stosowany dla Mówcy 7.", + "cv_tts8": "Wybierz głos, który ma być stosowany dla Mówcy 8.", + "cv_tts9": "Wybierz głos, który ma być stosowany dla Mówcy 9.", + "cv_tts10": "Wybierz głos, który ma być stosowany dla Mówcy 10.", + "cv_tts11": "Wybierz głos, który ma być stosowany dla Mówcy 11.", + "cv_tts12": "Wybierz głos, który ma być stosowany dla Mówcy 12.", + "cv_aux": "- Głos do zastosowania w przypadku niepowodzenia wykrycia Mówcy.", + "cv_button_apply": "ZASTOSUJ KONFIGURACJĘ", + "tab_help": "Pomoc", + }, + "swedish": { + "description": """ + ### 🎥 **Översätt videor enkelt med SoniTranslate!** 📽️ + + Ladda upp en video, ljudfil eller ange en YouTube-länk. 📽️ **Få den uppdaterade anteckningsboken från det officiella arkivet: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Se fliken `Hjälp` för instruktioner om hur du använder det. Nu ska vi ha roligt med videöversättning! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instruktioner för användning:** + + 1. 📤 Ladda upp en **video**, **ljudfil** eller ange en 🌐 **YouTube-länk.** + + 2. 🌍 Välj det språk du vill **översätta videon till**. + + 3. 🗣️ Ange **antalet personer som talar** i videon och **tilldela var och en en text-till-tal-röst** lämplig för översättningsspråket. + + 4. 🚀 Tryck på knappen '**Översätt**' för att få resultatet. + + --- + + # 🧩 **SoniTranslate stöder olika TTS (Text-to-Speech) motorer, vilka är:** + - EDGE-TTS → format `en-AU-WilliamNeural-Male` → Snabbt och noggrant. + - FACEBOOK MMS → format `en-facebook-mms VITS` → Rösten är mer naturlig; för tillfället använder den endast CPU. + - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Samma som den föregående, men den är optimerad för både CPU och GPU. + - BARK → format `en_speaker_0-Male BARK` → Bra kvalitet men långsam och benägen för hallucinationer. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Multispråkigt men kräver en OpenAI API-nyckel + - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Endast tillgängligt för kinesiska (förenklad), engelska, franska, tyska, italienska, portugisiska, polska, turkiska, ryska, nederländska, tjeckiska, arabiska, spanska, ungerska, koreanska och japanska. + + --- + + # 🎤 Hur man använder R.V.C. och R.V.C.2-röster (Valfritt) 🎶 + + Målet är att tillämpa en R.V.C. på den genererade TTS (Text-to-Speech) 🎙️ + + 1. I fliken `Anpassad röst R.V.C.`, ladda ner de modeller du behöver 📥 Du kan använda länkar från Hugging Face och Google Drive i format som zip, pth eller index. Du kan också ladda ner kompletta HF-utrymmen, men den här alternativet är inte särskilt stabilt 😕 + + 2. Gå nu till `Ersätt röst: TTS till R.V.C.` och markera rutan `aktivera` ✅ Efter det kan du välja de modeller du vill tillämpa på varje TTS-högtalare 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Justera F0-metoden som kommer att tillämpas på alla R.V.C. 🎛️ + + 4. Tryck på `TILLÄMPA KONFIGURATION` för att tillämpa de ändringar du gjorde 🔄 + + 5. 
Gå tillbaka till fliken för videöversättning och klicka på 'Översätt' ▶️ Nu kommer översättningen att göras med tillämpning av R.V.C. 🗣️ + + Tips: Du kan använda `Test R.V.C.` för att experimentera och hitta de bästa TTS eller konfigurationer att tillämpa på R.V.C. 🧪🔍 + + --- + + """, + "tab_translate": "Videöversättning", + "video_source": "Välj Videokälla", + "link_label": "Medialänk.", + "link_info": "Exempel: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL går här...", + "dir_label": "Videostig.", + "dir_info": "Exempel: /usr/home/min_video.mp4", + "dir_ph": "Sökväg går här...", + "sl_label": "Källspråk", + "sl_info": "Detta är det ursprungliga språket för videon", + "tat_label": "Översätt ljud till", + "tat_info": "Välj målspråket och se också till att välja den motsvarande TTS för det språket.", + "num_speakers": "Välj hur många personer som talar i videon.", + "min_sk": "Min högtalare", + "max_sk": "Max högtalare", + "tts_select": "Välj röst för varje högtalare.", + "sk1": "TTS Högtalare 1", + "sk2": "TTS Högtalare 2", + "sk3": "TTS Högtalare 3", + "sk4": "TTS Högtalare 4", + "sk5": "TTS Högtalare 5", + "sk6": "TTS Högtalare 6", + "sk7": "TTS Högtalare 7", + "sk8": "TTS Högtalare 8", + "sk9": "TTS Högtalare 9", + "sk10": "TTS Högtalare 10", + "sk11": "TTS Högtalare 11", + "sk12": "TTS Högtalare 12", + "vc_title": "Röstimitation på olika språk", + "vc_subtitle": """ + ### Replicera en persons röst över olika språk. + Effektiv med de flesta röster när den används på rätt sätt, men den kan inte uppnå perfektion i varje fall. + Röstimitation reproducerar endast referenshögtalarens ton, exklusive accent och känslor, som styrs av basens högtalar-TTS-modell och inte reproduceras av omvandlaren. + Detta kommer att ta ljudprover från huvudljudet för varje högtalare och bearbeta dem. + """, + "vc_active_label": "Aktiv Röstimitation", + "vc_active_info": "Aktiv Röstimitation: Reproducerar den ursprungliga högtalarens ton", + "vc_method_label": "Metod", + "vc_method_info": "Välj en metod för Röstimitationsprocessen", + "vc_segments_label": "Maxprover", + "vc_segments_info": "Maxprover: Är antalet ljudprover som kommer att genereras för processen, fler är bättre men det kan lägga till brus", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Tillämpar vokal dereverb på ljudproverna.", + "vc_remove_label": "Ta bort tidigare prover", + "vc_remove_info": "Ta bort tidigare prover: Ta bort de tidigare genererade proven, så nya måste skapas.", + "xtts_title": "Skapa en TTS baserad på ett ljud", + "xtts_subtitle": "Ladda upp en ljudfil på maximalt 10 sekunder med en röst. Genom att använda XTTS kommer en ny TTS att skapas med en röst liknande den tillhandahållna ljudfilen.", + "xtts_file_label": "Ladda upp ett kort ljud med rösten", + "xtts_name_label": "Namn för TTS", + "xtts_name_info": "Använd ett enkelt namn", + "xtts_dereverb_label": "Dereverb ljud", + "xtts_dereverb_info": "Dereverb ljud: Tillämpar vokal dereverb på ljudet", + "xtts_button": "Bearbeta ljudet och inkludera det i TTS-väljaren", + "xtts_footer": "Generera röst xtts automatiskt: Du kan använda `_XTTS_/AUTOMATIC.wav` i TTS-väljaren för att automatiskt generera segment för varje högtalare vid generering av översättningen.", + "extra_setting": "Avancerade Inställningar", + "acc_max_label": "Max Ljudacceleration", + "acc_max_info": "Maximal acceleration för översatta ljudsegment för att undvika överlappning. 
Ett värde på 1,0 representerar ingen acceleration", + "acc_rate_label": "Accelerationshastighetsreglering", + "acc_rate_info": "Accelerationshastighetsreglering: Justerar accelerationen för att passa segment som kräver lägre hastighet, vilket bibehåller kontinuitet och överväger nästa starttid.", + "or_label": "Överlappningsreducering", + "or_info": "Överlappningsreducering: Säkerställer att segment inte överlappar genom att justera starttider baserat på tidigare sluttider; kan störa synkroniseringen.", + "aud_mix_label": "Ljudmixningsmetod", + "aud_mix_info": "Blanda original- och översatta ljudfiler för att skapa en anpassad, balanserad utdata med två tillgängliga blandningslägen.", + "vol_ori": "Volym ursprungligt ljud", + "vol_tra": "Volym översatt ljud", + "voiceless_tk_label": "Röstlöst spår", + "voiceless_tk_info": "Röstlöst spår: Ta bort de ursprungliga ljudrösterna innan de kombineras med det översatta ljudet.", + "sub_type": "Undertexttyp", + "soft_subs_label": "Mjuka undertexter", + "soft_subs_info": "Mjuka undertexter: Valfria undertexter som tittare kan slå på eller av medan de tittar på videon.", + "burn_subs_label": "Bränn undertexter", + "burn_subs_info": "Bränn undertexter: Bädda in undertexter i videon, vilket gör dem till en permanent del av det visuella innehållet.", + "whisper_title": "Konfigurera transkription.", + "lnum_label": "Literalisera Siffror", + "lnum_info": "Literalisera Siffror: Ersätt numeriska representationer med deras skrivna motsvarigheter i transkriptet.", + "scle_label": "Ljudstädning", + "scle_info": "Ljudstädning: Förbättra röster, ta bort bakgrundsljud innan transkribering för högsta tidsstämpelprecision. Denna operation kan ta tid, särskilt med långa ljudfiler.", + "sd_limit_label": "Segmentvaraktighetsbegränsning", + "sd_limit_info": "Ange den maximala varaktigheten (i sekunder) för varje segment. Ljudet kommer att bearbetas med VAD och begränsa varaktigheten för varje segmentbit.", + "asr_model_info": "Det konverterar talat språk till text med hjälp av standardmodellen 'Whisper'. Använd en anpassad modell, till exempel genom att ange lagringsnamnet 'BELLE-2/Belle-whisper-large-v3-zh' i rullgardinsmenyn för att använda en anpassad modell för kinesiska. Hitta finjusterade modeller på Hugging Face.", + "ctype_label": "Beräkningstyp", + "ctype_info": "Att välja mindre typer som int8 eller float16 kan förbättra prestanda genom att minska minnesanvändningen och öka den beräkningsmässiga genomströmningen, men kan offra precisionen jämfört med större datatyper som float32.", + "batchz_label": "Batchstorlek", + "batchz_info": "Att minska batchstorleken sparar minne om din GPU har mindre VRAM och hjälper till att hantera minnesproblem.", + "tsscale_label": "Textsegmenteringsskala", + "tsscale_info": "Dela upp texten i segment efter meningar, ord eller tecken. Ord- och teckensegmentering ger finare granularitet, användbart för undertexter; inaktivering av översättning bevarar den ursprungliga strukturen.", + "srt_file_label": "Ladda upp en SRT-undertextsfil (kommer att användas istället för Whisper-transkriptionen)", + "divide_text_label": "Dela upp textsegment med:", + "divide_text_info": "(Experimentell) Ange en avgränsare för att dela upp befintliga textsegment på källspråket. Verktyget kommer att identifiera förekomster och skapa nya segment därefter. 
Ange flera avgränsare med |, t.ex.: !|?|...|。", + "diarization_label": "Diariseringsmodell", + "tr_process_label": "Översättningsprocess", + "out_type_label": "Utgångstyp", + "out_name_label": "Filnamn", + "out_name_info": "Namnet på utdatafilen", + "task_sound_label": "Uppgiftsstatusljud", + "task_sound_info": "Uppgiftsstatusljud: Spelar upp ett ljudlarm som indikerar uppgiftsslutförande eller fel under utförandet.", + "cache_label": "Återställ Framsteg", + "cache_info": "Återställ Framsteg: Fortsätt processen från senaste kontrollpunkt.", + "preview_info": "Förhandsgranskning klipper videon till endast 10 sekunder för teständamål. Avaktivera det för att hämta full videolängd.", + "edit_sub_label": "Redigera genererade undertexter", + "edit_sub_info": "Redigera genererade undertexter: Tillåter dig att köra översättningen i 2 steg. Först med knappen 'FÅ UNDERTEXTER OCH REDIGERA', får du undertexterna för att redigera dem, och sedan med knappen 'ÖVERSÄTT', kan du generera videon", + "button_subs": "FÅ UNDERTEXTER OCH REDIGERA", + "editor_sub_label": "Genererade undertexter", + "editor_sub_info": "Du kan redigera texten i de genererade undertexterna här. Du kan göra ändringar i gränssnittsalternativen innan du klickar på knappen 'ÖVERSÄTT', förutom 'Källspråk', 'Översätt ljud till' och 'Max högtalare', för att undvika fel. När du är klar, klicka på knappen 'ÖVERSÄTT'.", + "editor_sub_ph": "Tryck först på 'FÅ UNDERTEXTER OCH REDIGERA' för att hämta undertexterna", + "button_translate": "ÖVERSÄTT", + "output_result_label": "LADDA NER ÖVERSATT VIDEO", + "sub_ori": "Undertexter", + "sub_tra": "Översatta undertexter", + "ht_token_info": "Ett viktigt steg är att godkänna licensavtalet för att använda Pyannote. Du behöver ha ett konto på Hugging Face och acceptera licensen för att använda modellerna: https://huggingface.co/pyannote/speaker-diarization och https://huggingface.co/pyannote/segmentation. Hämta din NYCKELTOKEN här: https://hf.co/settings/tokens", + "ht_token_ph": "Token går här...", + "tab_docs": "Dokumentöversättning", + "docs_input_label": "Välj Dokumentkälla", + "docs_input_info": "Det kan vara PDF, DOCX, TXT eller text", + "docs_source_info": "Detta är det ursprungliga språket för texten", + "chunk_size_label": "Max antal tecken som TTS kommer att behandla per segment", + "chunk_size_info": "Ett värde på 0 tilldelar ett dynamiskt och mer kompatibelt värde för TTS.", + "docs_button": "Starta Språkomvandlingsbryggan", + "cv_url_info": "Ladda automatiskt ner R.V.C.-modellerna från URL:en. Du kan använda länkar från HuggingFace eller Drive, och du kan inkludera flera länkar, var och en separerad med ett komma. Exempel: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Ersätt röst: TTS till R.V.C.", + "sec1_title": "### 1. För att aktivera dess användning, markera den som aktiverad.", + "enable_replace": "Markera detta för att aktivera användningen av modellerna.", + "sec2_title": "### 2. Välj en röst som ska tillämpas på varje TTS för varje motsvarande högtalare och tillämpa konfigurationerna.", + "sec2_subtitle": "Beroende på hur många du kommer att använda, behöver var och en sin respektive modell. 
Dessutom finns det en hjälpmodell om högtalaren av någon anledning inte upptäcks korrekt.", + "cv_tts1": "Välj röst att tillämpa för Högtalare 1.", + "cv_tts2": "Välj röst att tillämpa för Högtalare 2.", + "cv_tts3": "Välj röst att tillämpa för Högtalare 3.", + "cv_tts4": "Välj röst att tillämpa för Högtalare 4.", + "cv_tts5": "Välj röst att tillämpa för Högtalare 5.", + "cv_tts6": "Välj röst att tillämpa för Högtalare 6.", + "cv_tts7": "Välj röst att tillämpa för Högtalare 7.", + "cv_tts8": "Välj röst att tillämpa för Högtalare 8.", + "cv_tts9": "Välj röst att tillämpa för Högtalare 9.", + "cv_tts10": "Välj röst att tillämpa för Högtalare 10.", + "cv_tts11": "Välj röst att tillämpa för Högtalare 11.", + "cv_tts12": "Välj röst att tillämpa för Högtalare 12.", + "cv_aux": "- Röst att tillämpa om en högtalare inte upptäcks framgångsrikt.", + "cv_button_apply": "TILLÄMPA KONFIGURATION", + "tab_help": "Hjälp", + }, + "korean": { + "description": """ + ### 🎥 **SoniTranslate를 사용하여 비디오를 쉽게 번역하세요!** 📽️ + + 비디오, 오디오 파일을 업로드하거나 YouTube 링크를 제공하세요. 📽️ **공식 저장소에서 최신 노트북을 받으세요.: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + 사용 방법에 대한 지침은 `도움말` 탭을 참조하세요. 비디오 번역으로 즐거운 시간을 보내세요! 🚀🎉 + """, + "tutorial": """ + # 🔰 **사용 방법:** + + 1. 📤 **비디오**, **오디오 파일**을 업로드하거나 🌐 **YouTube 링크**를 제공하세요. + + 2. 🌍 **비디오를 번역할 언어**를 선택하세요. + + 3. 🗣️ 비디오에서 **말하는 사람 수**를 지정하고 각각을 번역 언어에 적합한 텍스트 음성으로 할당하세요. + + 4. 🚀 '**번역**' 버튼을 눌러 결과를 얻으세요. + + --- + + # 🧩 **SoniTranslate는 다양한 TTS (텍스트 음성 변환) 엔진을 지원합니다. 이는 다음과 같습니다:** + - EDGE-TTS → 형식 `en-AU-WilliamNeural-Male` → 빠르고 정확합니다. + - FACEBOOK MMS → 형식 `en-facebook-mms VITS` → 음성이 더 자연스럽지만 현재 CPU만 사용됩니다. + - PIPER TTS → 형식 `en_US-lessac-high VITS-onnx` → 이전 것과 동일하지만 CPU와 GPU 모두 최적화되었습니다. + - BARK → 형식 `en_speaker_0-Male BARK` → 품질은 좋지만 느리고 환각에 취약합니다. + - OpenAI TTS → 형식 `>alloy OpenAI-TTS` → 다국어지만 OpenAI API 키가 필요합니다 + - Coqui XTTS → 형식 `_XTTS_/AUTOMATIC.wav` → 중국어 (간체), 영어, 프랑스어, 독일어, 이탈리아어, 포르투갈어, 폴란드어, 터키어, 러시아어, 네덜란드어, 체코어, 아랍어, 스페인어, 헝가리어, 한국어 및 일본어만 사용할 수 있습니다. + + --- + + # 🎤 R.V.C. 및 R.V.C.2 음성 사용 방법 (선택 사항) 🎶 + + 목표는 생성된 TTS (텍스트 음성 변환)에 R.V.C.를 적용하는 것입니다. 🎙️ + + 1. `Custom Voice R.V.C.` 탭에서 필요한 모델을 다운로드하세요. 📥 Hugging Face 및 Google Drive에서 zip, pth 또는 index와 같은 형식의 링크를 사용할 수 있습니다. HF 공간 저장소 전체를 다운로드할 수도 있지만 이 옵션은 안정성이 떨어집니다 😕 + + 2. 이제 `Replace voice: TTS to R.V.C.`로 이동하여 `enable` 상자를 확인하세요 ✅ 이후 각 TTS 스피커에 적용할 모델을 선택할 수 있습니다 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. 모든 R.V.C.에 적용할 F0 방법을 조정하세요. 🎛️ + + 4. 변경한 사항을 적용하려면 `APPLY CONFIGURATION`을 누르세요. 🔄 + + 5. 비디오 번역 탭으로 돌아가 'Translate'를 클릭하세요 ▶️ 이제 번역은 R.V.C.를 적용하여 수행됩니다. 🗣️ + + 팁: `Test R.V.C.`를 사용하여 실험하고 R.V.C.에 적용할 최상의 TTS 또는 구성을 찾을 수 있습니다. 🧪🔍 + + --- + + """, + "tab_translate": "비디오 번역", + "video_source": "비디오 소스 선택", + "link_label": "미디어 링크.", + "link_info": "예시: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL을 입력하세요...", + "dir_label": "비디오 경로.", + "dir_info": "예시: /usr/home/my_video.mp4", + "dir_ph": "경로를 입력하세요...", + "sl_label": "원본 언어", + "sl_info": "비디오의 원래 언어입니다", + "tat_label": "번역할 언어 선택", + "tat_info": "대상 언어를 선택하고 해당 언어에 대한 TTS도 선택하세요.", + "num_speakers": "비디오에서 몇 명이 말하고 있는지 선택하세요.", + "min_sk": "최소 스피커", + "max_sk": "최대 스피커", + "tts_select": "각 스피커에 원하는 음성 선택", + "sk1": "TTS 스피커 1", + "sk2": "TTS 스피커 2", + "sk3": "TTS 스피커 3", + "sk4": "TTS 스피커 4", + "sk5": "TTS 스피커 5", + "sk6": "TTS 스피커 6", + "sk7": "TTS 스피커 7", + "sk8": "TTS 스피커 8", + "sk9": "TTS 스피커 9", + "sk10": "TTS 스피커 10", + "sk11": "TTS 스피커 11", + "sk12": "TTS 스피커 12", + "vc_title": "다른 언어에서 음성 모방", + "vc_subtitle": """ + ### 여러 언어로 사람의 음성을 복제합니다. 
+ 대부분의 경우 적절하게 사용되면 효과적이지만 모든 경우에 완벽한 결과를 보장하지는 않을 수 있습니다. + 음성 모방은 기본 스피커 TTS 모델에 의해 지배되는 악센트 및 감정을 제외한 참조 스피커의 음조만 복제합니다. + 이는 각 스피커의 주요 오디오에서 오디오 샘플을 가져와 처리합니다. + """, + "vc_active_label": "활성화된 음성 모방", + "vc_active_info": "활성화된 음성 모방: 원래 스피커의 음조를 복제합니다", + "vc_method_label": "방법", + "vc_method_info": "음성 모방 프로세스에 사용할 방법 선택", + "vc_segments_label": "최대 샘플 수", + "vc_segments_info": "최대 샘플 수: 프로세스에 생성될 오디오 샘플 수입니다. 더 많으면 더 좋지만 노이즈가 추가될 수 있습니다", + "vc_dereverb_label": "소음 제거", + "vc_dereverb_info": "소음 제거: 오디오 샘플에 음성 소음 제거를 적용합니다.", + "vc_remove_label": "이전 샘플 제거", + "vc_remove_info": "이전 샘플 제거: 생성된 이전 샘플을 제거하므로 새로 생성해야 합니다.", + "xtts_title": "오디오 기반 TTS 생성", + "xtts_subtitle": "음성을 포함한 최대 10초의 짧은 오디오 파일을 업로드하세요. XTTS를 사용하여 제공된 오디오 파일과 유사한 음성을 가진 새 TTS가 생성됩니다.", + "xtts_file_label": "음성을 포함한 짧은 오디오를 업로드하세요", + "xtts_name_label": "TTS에 대한 이름", + "xtts_name_info": "간단한 이름을 사용하세요", + "xtts_dereverb_label": "음성 소음 제거", + "xtts_dereverb_info": "음성 소음 제거: 오디오에 음성 소음 제거를 적용합니다", + "xtts_button": "오디오를 처리하고 TTS 선택기에 포함시킵니다", + "xtts_footer": "음성 xtts 자동 생성: 번역 생성 시 각 스피커에 대해 세그먼트를 자동으로 생성하려면 TTS 선택기에서 `_XTTS_/AUTOMATIC.wav`를 사용할 수 있습니다.", + "extra_setting": "고급 설정", + "acc_max_label": "최대 오디오 가속도", + "acc_max_info": "중첩을 피하기 위해 번역된 오디오 세그먼트에 대한 최대 가속도. 값이 1.0이면 가속도가 없음을 의미합니다", + "acc_rate_label": "가속도 조절", + "acc_rate_info": "가속도 조절: 속도가 느린 세그먼트에 대응하기 위해 가속도를 조절하여 연속성을 유지하고 다음 시작 시간을 고려합니다.", + "or_label": "중첩 감소", + "or_info": "중첩 감소: 이전 종료 시간을 기반으로 시작 시간을 조정하여 세그먼트가 겹치지 않도록 합니다. 동기화를 방해할 수 있습니다.", + "aud_mix_label": "오디오 혼합 방법", + "aud_mix_info": "원본 및 번역된 오디오 파일을 혼합하여 두 가지 사용 가능한 혼합 모드로 사용자 정의된 균형 잡힌 출력을 만듭니다.", + "vol_ori": "원본 오디오 볼륨", + "vol_tra": "번역된 오디오 볼륨", + "voiceless_tk_label": "음성 제거 트랙", + "voiceless_tk_info": "음성 제거 트랙: 번역된 오디오와 결합하기 전에 원본 오디오 음성을 제거합니다.", + "sub_type": "자막 유형", + "soft_subs_label": "부드러운 자막", + "soft_subs_info": "부드러운 자막: 시청자가 비디오를 시청하는 동안 켜고 끌 수 있는 선택적 자막.", + "burn_subs_label": "자막 불러오기", + "burn_subs_info": "자막 불러오기: 자막을 비디오에 임베드하여 시각 콘텐츠의 영구적인 부분으로 만듭니다.", + "whisper_title": "전사 구성.", + "lnum_label": "숫자를 문자로 변환", + "lnum_info": "숫자를 문자로 변환: 텍스트에서 숫자 표현을 해당되는 글자로 대체하십시오.", + "scle_label": "소리 정리", + "scle_info": "소리 정리: 음성을 향상시키고 타임 스탬프 정확도를 위해 전사하기 전에 배경 소음을 제거하십시오. 이 작업은 특히 긴 오디오 파일의 경우 시간이 걸릴 수 있습니다.", + "sd_limit_label": "세그먼트 기간 제한", + "sd_limit_info": "각 세그먼트의 최대 기간(초)을 지정하십시오. 오디오는 VAD를 사용하여 각 세그먼트 조각의 기간을 제한하여 처리됩니다.", + "asr_model_info": "기본적으로 '속삭임 모델'을 사용하여 구어를 텍스트로 변환합니다. 예를 들어, 중국어 언어 파인튜닝 모델을 사용하려면 드롭다운에 'BELLE-2/Belle-whisper-large-v3-zh' 저장소 이름을 입력하십시오. Hugging Face에서 파인튜닝된 모델을 찾을 수 있습니다.", + "ctype_label": "계산 유형", + "ctype_info": "int8 또는 float16과 같은 더 작은 유형을 선택하면 메모리 사용을 줄이고 계산 처리량을 증가시켜 성능을 향상시킬 수 있지만 float32와 같은 큰 데이터 유형에 비해 정밀성을 희생할 수 있습니다.", + "batchz_label": "일괄 크기", + "batchz_info": "일괄 크기를 줄이면 GPU의 VRAM이 적은 경우 메모리를 절약할 수 있으며 메모리 부족 문제를 관리하는 데 도움이됩니다.", + "tsscale_label": "텍스트 분할 규모", + "tsscale_info": "문장, 단어 또는 문자별로 텍스트를 세그먼트로 나눕니다. 단어 및 문자 분할은 자막에 유용한 더 세밀한 세분성을 제공합니다. 번역 비활성화는 원래 구조를 보존합니다.", + "srt_file_label": "SRT 자막 파일 업로드(전사 대신 사용됨)", + "divide_text_label": "다음에 따라 텍스트 세그먼트를 분할:", + "divide_text_info": "(실험적) 기존 텍스트 세그먼트를 분할하기 위해 구분 기호를 입력하세요. 도구는 발생한 사례를 식별하고 그에 따라 새 세그먼트를 생성합니다. |를 사용하여 여러 구분 기호를 지정하세요. 
예: !|?|...|。", + "diarization_label": "다이어리제이션 모델", + "tr_process_label": "번역 프로세스", + "out_type_label": "출력 유형", + "out_name_label": "파일 이름", + "out_name_info": "출력 파일의 이름", + "task_sound_label": "작업 상태 사운드", + "task_sound_info": "작업 상태 사운드: 작업 완료 또는 실행 중 오류를 나타내는 사운드 알림을 재생합니다.", + "cache_label": "진행 상태 검색", + "cache_info": "진행 상태 검색: 마지막 체크포인트에서 프로세스를 계속합니다.", + "preview_info": "미리 보기는 테스트 목적으로 비디오를 10초로 자릅니다. 전체 비디오 지속 시간을 검색하려면 비활성화하세요.", + "edit_sub_label": "생성된 자막 편집", + "edit_sub_info": "생성된 자막을 편집할 수 있습니다: '자막 가져오기 및 편집' 버튼을 사용하여 먼저 자막을 가져와 편집한 후, '번역' 버튼을 사용하여 비디오를 생성할 수 있습니다", + "button_subs": "자막 가져오기 및 편집", + "editor_sub_label": "생성된 자막", + "editor_sub_info": "여기에서 생성된 자막의 텍스트를 자유롭게 편집할 수 있습니다. '번역할 언어', '번역할 언어' 및 '최대 스피커'를 제외한 인터페이스 옵션을 변경한 후 '번역' 버튼을 클릭하여 오류를 방지하세요. 작업을 마치면 '번역' 버튼을 클릭하세요.", + "editor_sub_ph": "먼저 '자막 가져오기 및 편집'를 눌러 자막을 가져옵니다", + "button_translate": "번역", + "output_result_label": "번역된 비디오 다운로드", + "sub_ori": "자막", + "sub_tra": "번역된 자막", + "ht_token_info": "중요한 단계 중 하나는 Pyannote 사용에 대한 라이선스 동의를 받는 것입니다. Hugging Face에서 계정을 가져야하며 다음 모델을 사용하기 위해 라이선스를 수락해야합니다: https://huggingface.co/pyannote/speaker-diarization 및 https://huggingface.co/pyannote/segmentation. 여기에서 키 토큰을 가져옵니다: https://hf.co/settings/tokens", + "ht_token_ph": "토큰을 입력하세요...", + "tab_docs": "문서 번역", + "docs_input_label": "문서 소스 선택", + "docs_input_info": "PDF, DOCX, TXT 또는 텍스트가 될 수 있습니다", + "docs_source_info": "텍스트의 원래 언어입니다", + "chunk_size_label": "TTS가 세그먼트 당 처리할 최대 문자 수", + "chunk_size_info": "값이 0이면 TTS에 대해 동적이고 더 호환 가능한 값이 할당됩니다.", + "docs_button": "언어 변환 브릿지 시작", + "cv_url_info": "URL에서 R.V.C. 모델을 자동으로 다운로드합니다. HuggingFace 또는 드라이브에서 링크를 사용할 수 있으며 각각을 쉼표로 구분하여 여러 링크를 포함할 수 있습니다. 예: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "음성 교체: TTS에서 R.V.C.로", + "sec1_title": "### 1. 사용하도록 설정하려면 활성화로 표시합니다.", + "enable_replace": "모델 사용을 활성화하려면 이를 확인합니다.", + "sec2_title": "### 2. 각 해당하는 스피커의 TTS에 적용할 음성을 선택하고 구성을 적용합니다.", + "sec2_subtitle": "사용할 수에 따라 각각에 해당하는 모델이 필요합니다. 추가적으로 스피커가 올바르게 감지되지 않은 경우 보조 모델이 있습니다.", + "cv_tts1": "스피커 1에 적용할 음성을 선택하세요.", + "cv_tts2": "스피커 2에 적용할 음성을 선택하세요.", + "cv_tts3": "스피커 3에 적용할 음성을 선택하세요.", + "cv_tts4": "스피커 4에 적용할 음성을 선택하세요.", + "cv_tts5": "스피커 5에 적용할 음성을 선택하세요.", + "cv_tts6": "스피커 6에 적용할 음성을 선택하세요.", + "cv_tts7": "스피커 7에 적용할 음성을 선택하세요.", + "cv_tts8": "스피커 8에 적용할 음성을 선택하세요.", + "cv_tts9": "스피커 9에 적용할 음성을 선택하세요.", + "cv_tts10": "스피커 10에 적용할 음성을 선택하세요.", + "cv_tts11": "스피커 11에 적용할 음성을 선택하세요.", + "cv_tts12": "스피커 12에 적용할 음성을 선택하세요.", + "cv_aux": "- 스피커가 올바르게 감지되지 않은 경우 적용할 음성.", + "cv_button_apply": "구성 적용", + "tab_help": "도움말", + }, + "marathi": { + "description": """ + ### 🎥 **आसानीसोबत SoniTranslate द्वारे व्हिडिओ अनुवाद करा!** 📽️ + + एक व्हिडिओ, ऑडिओ फाईल अपलोड करा किंवा एक YouTube लिंक प्रदान करा. 📽️ **अद्यतनित नोटबुक घ्या आधिकृत भंडारात।: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + तपशील देखण्यासाठी 'मदत' टॅब पहा. व्हिडिओ अनुवादासोबत मजा करण्याची सुरवात करूया! 🚀🎉 + """, + "tutorial": """ + # 🔰 **वापरायला निर्देशिका:** + + 1. 📤 **व्हिडिओ**, **ऑडिओ फाईल** अपलोड करा किंवा 🌐 **YouTube लिंक प्रदान करा.** + + 2. 🌍 व्हिडिओ **अनुवाद** करण्यासाठी कोणत्या **भाषेत निवडा.** + + 3. 🗣️ व्हिडिओमध्ये **किती लोक बोलत आहेत** ते स्पष्ट करा आणि प्रत्येकाला अनुवाद भाषेसाठी उपयुक्त TTS द्या. + + 4. 🚀 '**अनुवाद**' बटण दाबा निकाल मिळवण्यासाठी. 
+ + --- + + # 🧩 **SoniTranslate विविध TTS (पाठ-टू-स्पीच) इंजिनसाठी समर्थन करते, ज्या म्हणजे:** + - EDGE-TTS → स्वरूप `en-AU-WilliamNeural-Male` → जलद आणि खात्रीशील. + - FACEBOOK MMS → स्वरूप `en-facebook-mms VITS` → ध्वनी अधिक प्राकृतिक आहे; ह्या क्षणी, हे केवळ CPU वापरते. + - PIPER TTS → स्वरूप `en_US-lessac-high VITS-onnx` → म्हणजे अखेरचा, परंतु ह्यात CPU आणि GPU दोन्हीत अनुकूलित केले आहे. + - BARK → स्वरूप `en_speaker_0-Male BARK` → चांगली गुणवत्ता परंतु मंद, आणि हे हल्ल्यांसाठी आदर्श आहे. + - OpenAI TTS → स्वरूप `>alloy OpenAI-TTS` → बहुभाषिक आहे पण OpenAI API की आवश्यकता आहे + - Coqui XTTS → स्वरूप `_XTTS_/AUTOMATIC.wav` → केवळ उपलब्ध आहे: चिनी (सरलीकृत), इंग्रजी, फ्रेंच, जर्मन, इटालियन, पोर्तुगीज, पोलिश, तुर्की, रशियन, डच, चेक, अरबी, स्पॅनिश, हंगेरियन, कोरियन आणि जपानी. + + --- + + # 🎤 कसे वापरावे आर.व्ही.सी. आणि आर.व्ही.सी.2 आवाज (पर्वतीय) 🎶 + + उद्दिष्ट म्हणजे उत्पन्न केलेल्या TTS (पाठ-टू-स्पीच) वर एक आर.व्ही.सी. लागू करा 🎙️ + + 1. `कस्टम व्हॉईस आर.व्ही.सी.` टॅबमध्ये, आपल्याला आवश्यक असलेल्या मॉडेल्स डाउनलोड करा 📥 आपण Hugging Face आणि Google Drive यांच्या लिंक्सचा वापर करू शकता, जसे की zip, pth किंवा इंडेक्स. आपण पूर्ण HF स्पेस भंडारांचा डाउनलोड करू शकता, परंतु ह्या पर्यायाचा स्थिरपणा काही कमी आहे 😕 + + 2. आता, `आर.व्ही.सी. च्या आवाजाच्या TTS ला बदला: टीटीएस ते आर.व्ही.सी.` आणि `सक्षम` बॉक्स तपासा ✅ यानंतर, आपण प्रत्येक TTS वक्त्याला लागणारा मॉडेल निवडू शकता 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. सर्व आर.व्ही.सी. ला लागू केलेला F0 विधान अनुकूलीत करा 🎛️ + + 4. आपल्याने केलेल्या बदल लागू करण्यासाठी `अनुप्रयोग बदल` दाबा 🔄 + + 5. व्हिडिओ अनुवाद टॅबमध्ये परत जा आणि 'अनुवाद' वर क्लिक करा ▶️ आता, अनुवाद R.V.C. लागू करत आहे 🗣️ + + सूचना: आपण `टेस्ट आर.व्ही.सी.` वापरू शकता व सर्वोत्तम TTS किंवा आर.व्ही.सी. लागू करण्यासाठी गुणवत्ता शोधण्यासाठी वापरू शकता 🧪🔍 + + --- + + """, + "tab_translate": "व्हिडिओ अनुवाद", + "video_source": "व्हिडिओ स्रोत निवडा", + "link_label": "मीडिया लिंक.", + "link_info": "उदाहरण: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL येथे जातो...", + "dir_label": "व्हिडिओ मार्ग.", + "dir_info": "उदाहरण: /usr/home/my_video.mp4", + "dir_ph": "मार्ग येथे जातो...", + "sl_label": "मूळ भाषा", + "sl_info": "हे व्हिडिओची मूळ भाषा आहे", + "tat_label": "ऑडिओ अनुवाद करा", + "tat_info": "लक्ष्य भाषा निवडा आणि त्या भाषेसाठी संबद्ध TTS निवडण्यास सुनिश्चित करा.", + "num_speakers": "व्हिडिओमध्ये किती लोक बोलत आहेत हे निवडा.", + "min_sk": "किमान बोलताही", + "max_sk": "किमान बोलताही", + "tts_select": "प्रत्येक वक्त्यासाठी आपल्याला कसा आवाज आवडतो ते निवडा.", + "sk1": "TTS वक्त्य 1", + "sk2": "TTS वक्त्य 2", + "sk3": "TTS वक्त्य 3", + "sk4": "TTS वक्त्य 4", + "sk5": "TTS वक्त्य 5", + "sk6": "TTS वक्त्य 6", + "sk7": "TTS वक्त्य 7", + "sk8": "TTS वक्त्य 8", + "sk9": "TTS वक्त्य 9", + "sk10": "TTS वक्त्य 10", + "sk11": "TTS वक्त्य 11", + "sk12": "TTS वक्त्य 12", + "vc_title": "विविध भाषांमध्ये आवाज नक्कल", + "vc_subtitle": """ + ### विविध भाषांमध्ये व्यक्तीचा आवाज पुनर्निर्मित करा. + अनुकूलप्रद केल्यास अधिकांश आवाजांसह अद्याप अव्याप्ती न मिळताना, प्रत्येक गोष्टीत उपयोगी आहे. आवाज पुनर्निर्मित केवळ संदर्भ वक्त्याच्या टोन अधिल्यास आहे, ज्याची मूळ वक्त्य TTS मॉडेल द्वारे नियंत्रित केली जाते आणि नक्कल करणारी नाही. या विधानाने एका प्रमुख व्हिडिओतील प्रत्येक वक्त्याचे ऑडियो संच घेऊन ते प्रक्रिया करेल. 
+ """, + "vc_active_label": "सक्रिय आवाज नक्कल", + "vc_active_info": "सक्रिय आवाज नक्कल: मूळ वक्त्याचा आवाज पुनर्निर्मित करते", + "vc_method_label": "पद्धत", + "vc_method_info": "आवाज नक्कल प्रक्रियेसाठी एक पद्धत निवडा", + "vc_segments_label": "कमाल सॅम्पल्स", + "vc_segments_info": "कमाल सॅम्पल्स: प्रक्रियेसाठी ऑडियो सॅम्पल्सची संख्या आहे, अधिक चांगलं आहे परंतु ते आवाज जोडणारं करू शकतात", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: ऑडियो सॅम्पल्सवर ध्वनीक सांकेतिक दिवस लागू करते.", + "vc_remove_label": "आधीचे सॅम्पल्स काढा", + "vc_remove_info": "आधीचे सॅम्पल्स काढा: मागील सॅम्पल्स काढा: मागील सॅम्पल्स काढा, म्हणजे नवीन सामग्री करण्यासाठी त्या नवीन सॅम्पल्स बनवणे आवश्यक आहे.", + "xtts_title": "ऑडियोवर आधारित TTS तयार करा", + "xtts_subtitle": "आवाजासह 10 सेकंदांचा मोठा ऑडियो फाईल अपलोड करा. XTTS वापरून, दिलेल्या ऑडियो फाईलसोबत समान आवाजासह नवीन TTS तयार केला जाईल.", + "xtts_file_label": "आवाजासह एक क्षिप्र ऑडियो अपलोड करा", + "xtts_name_label": "TTS साठी नाव", + "xtts_name_info": "एक साधा नाव वापरा", + "xtts_dereverb_label": "ऑडियोवर ध्वनीक सांकेतिक दिवस लागू करा", + "xtts_dereverb_info": "ऑडियोवर ध्वनीक सांकेतिक दिवस लागू करा: ऑडियोवर ध्वनीक सांकेतिक दिवस लागू करते", + "xtts_button": "ऑडियो प्रक्रिया करा आणि त्यामध्ये समाविष्ट करा", + "xtts_footer": "स्वयंचली आवाज XTTS उत्पादित करा: आपण TTS निवडकासाठी `_XTTS_/AUTOMATIC.wav` वापरू शकता, प्रत्येक वक्त्यासाठी नवीन सेगमेंट उत्पन्न करण्यासाठी आणि अनुवाद वापरताना एकत्रित करण्यासाठी।", + "extra_setting": "उन्नत सेटिंग्ज", + "acc_max_label": "ऑडियो अधिकतम एक्सेलरेशन", + "acc_max_info": "ओव्हरलॅपिंग टाळण्यासाठी अनुवादित ऑडियो सेगमेंटसाठी अधिकतम एक्सेलरेशन. 1.0 ची एक मूल्य अधिकतम एक्सेलरेशन प्रतिनिधित्व करते", + "acc_rate_label": "वेगवर्धी दर व्यवस्थापन", + "acc_rate_info": "वेगवर्धी दर व्यवस्थापन: अल्प गतीचे आवश्यक असलेले क्षेत्र समायोजित करण्यासाठी वेगवर्धी व्यवस्थापन करते, सततता ठेवते आणि पुढील सुरुवातीचा वेळ मलान घेतला जातो.", + "or_label": "ओव्हरलॅप कमी करा", + "or_info": "ओव्हरलॅप कमी करा: मागील समाप्तीच्या वेळेस आधारित सुरुवातीच्या वेळा समायोजित करून सेगमेंट ओव्हरलॅप होण्यास रोखते; समकालिकरण अडचणी उत्पन्न करू शकतो.", + "aud_mix_label": "ऑडियो मिक्सिंग पद्धत", + "aud_mix_info": "स्वच्छ आणि संतुलित आउटपुट सादर करण्यासाठी मूळ आणि अनुवादित ऑडियो फाईल्स एकत्रित करण्यासाठी आवश्यक दोन मिक्सिंग मोड्युल्या सोडल्या आहेत.", + "vol_ori": "मूळ ऑडियोची व्हॉल्यूम", + "vol_tra": "अनुवादित ऑडियोची व्हॉल्यूम", + "voiceless_tk_label": "आवाजरहित ट्रॅक", + "voiceless_tk_info": "आवाजरहित ट्रॅक: अनुवादित ऑडियोसोबत संयुक्त करण्यापूर्वी मूळ ऑडियोची आवाजे काढा.", + "sub_type": "उपशीर्षक प्रकार", + "soft_subs_label": "कोमल सबटायटल्स", + "soft_subs_info": "कोमल सबटायटल्स: दर्शक व्हिडिओ पाहताना चालू निशांत करू शकतात किंवा बंद करू शकतात.", + "burn_subs_label": "सबटायटल्स जळवा", + "burn_subs_info": "सबटायटल्स जळवा: व्हिडिओमध्ये सबटायटल्स आजार करा, त्यांना दृश्यांतराचा कोणताही स्थायी भाग बनवून करा.", + "whisper_title": "वाचन विक्रमण संरचना.", + "lnum_label": "संख्या शब्दांतर", + "lnum_info": "संख्या शब्दांतर: अंकांचे प्रतिनिधित्व लेखित सर्वकाशांमध्ये बदला करा.", + "scle_label": "आवाज स्वच्छता", + "scle_info": "आवाज स्वच्छता: वादला तयार करण्यापूर्वी आवाज आणि बॅकग्राऊंड ध्वनी काढा. हे काम वेगवेगळ्या आवाज फाईल्ससह करता येऊ शकते.", + "sd_limit_label": "सेगमेंट अवधी सीमा", + "sd_limit_info": "प्रत्येक सेगमेंटसाठी कोणत्याही अवधीचा महासूचीत (सेकंदांमध्ये) सुनिश्चित करा. ऑडिओ वाडचा वापर करून प्रत्येक सेगमेंटच्या तुकड्याची अवधी सीमित करण्यात येईल.", + "asr_model_info": "जीवनाचा मूळ 'फिस्फिंग' मॉडेल वापरून बोललेली भाषा ते टेक्स्टमध्ये बदलते. 
उदाहरणार्थ, चीनी भाषेतील फायनट्यून्ड मॉडेल वापरण्यासाठी ड्रॉपडाऊनमध्ये 'BELLE-2/Belle-whisper-large-v3-zh' संग्रह नाव नोंदवा. Hugging Face वर फायनट्यून्ड मॉडेल्स शोधा.", + "ctype_label": "गणना प्रकार", + "ctype_info": "int8 किंवा float16 आढळवून कमी डेटा प्रकारांमध्ये निर्देशन करणे कामाची वेगवेगळी प्रदर्शन करू शकते आणि गणना द्वारे अपेक्षित क्षमतेची वाढवू शकते, परंतु float32 आणि इतर मोठ्या डेटा प्रकारांपेक्षा निश्चितता कुठल्या प्रकारे कमी करू शकते.", + "batchz_label": "बॅच आकार", + "batchz_info": "आपल्याला कमी VRAM असलेले GPU असल्यास बॅच आकार कमी करणे मेमरी झटका आणू शकते आणि मेमरी नसलेली समस्या व्यवस्थापित करण्यास मदत करू शकते.", + "tsscale_label": "टेक्स्ट सेगमेंटेशन पैमाना", + "tsscale_info": "पाठाचे सेगमेंट वाक्य, शब्द किंवा अक्षरांमध्ये वागवा. शब्द आणि अक्षर सेगमेंटेशन उपशीर्षकसाठी उपयुक्त तंत्रज्ञान उपलब्ध करून देतात; अनुवाद बंद करणे मूल संरचना संरक्षित करते.", + "srt_file_label": "एसआरटी उपशीर्षक फाईल अपलोड करा (व्हिस्परच्या विवेचनाच्या विरोधात वापरली जाईल)", + "divide_text_label": "टेक्स्ट सेगमेंट्स पुनर्विभाजित करा:", + "divide_text_info": "(प्रयोगशील) स्रोत भाषेतील विद्यमान टेक्स्ट सेगमेंट्सचा विभाग करण्यासाठी एक विभाजक प्रविष्ट करा. टूलला उपलब्धींना ओळखण्यासाठी आणि नुकसानकर्ता करण्यासाठी त्यामुळे नवीन सेगमेंट्स निर्मित करते. | चा वापर करून अनेक विभाजक स्पष्ट करा, उदा.: !|?|...|।", + "diarization_label": "डायरिझेशन मॉडेल", + "tr_process_label": "भाषांतर प्रक्रिया", + "out_type_label": "आउटपुट प्रकार", + "out_name_label": "फाईलचं नाव", + "out_name_info": "आउटपुट फाईलचं नाव", + "task_sound_label": "काम स्थिती आवाज", + "task_sound_info": "काम स्थिती आवाज: काम संपल्याचे किंवा क्रियाकलापातील त्रुटी दर्शवणारा ध्वन आवाज करा.", + "cache_label": "प्रगती पुनर्प्राप्त करा", + "cache_info": "प्रगती पुनर्प्राप्त करा: शेवटचा चेकपॉईंट येथून प्रक्रिया सुरू करा.", + "preview_info": "परीक्षणासाठी व्हिडिओला केवळ 10 सेकंदांसाठी कट्टा करते. कृपया पूर्ण व्हिडिओ अवधी प्राप्त करण्यासाठी हे निष्क्रिय करा.", + "edit_sub_label": "तयार केलेले उपशीर्षक संपादित करा", + "edit_sub_info": "तयार केलेले उपशीर्षक संपादित करा: अनुवाद करण्यासाठी 2 चरणांमध्ये अनुवाद चालवण्याची परवानगी देते. पहिल्यांदा 'उपशीर्षक मिळवा आणि संपादित करा' बटणावर क्लिक करून तुम्हाला उपशीर्षक मिळेल आणि त्या संपादित करण्यासाठी, आणि त्यानंतर 'अनुवाद' बटणावर क्लिक करून, आपण व्हिडिओ तयार करू शकता", + "button_subs": "उपशीर्षक मिळवा आणि संपादित करा", + "editor_sub_label": "तयार केलेले उपशीर्षक", + "editor_sub_info": "येथील तयार केलेल्या उपशीर्षकांमध्ये टेक्स्ट संपादित करण्यासाठी मनःपूर्वक वापरा. आपण 'अनुवाद' बटणावर क्लिक करण्यापूर्वी, संवादीचे निवडणे, 'मूळ भाषा', 'ऑडियोचे अनुवाद करा', आणि 'अधिक स्पीकर्स' या अनुक्रमात किंवा संरचना विकल्प बदलू शकता, त्यांचा अशा चुकांवर टाकण्यासाठी. एकदा तुम्ही संपू नेल, 'अनुवाद' बटणावर क्लिक करा.", + "editor_sub_ph": "प्रथम 'उपशीर्षक मिळवा आणि संपादित करा' बटणावर क्लिक करण्यात येतो", + "button_translate": "अनुवाद करा", + "output_result_label": "अनुवादित व्हिडिओ डाउनलोड करा", + "sub_ori": "उपशीर्षक", + "sub_tra": "अनुवादित उपशीर्षक", + "ht_token_info": "एक महत्त्वाचं कार्य म्हणजे Pyannote वापरासाठी लायसेंस समजून घेणे. आपल्याला Hugging Face वर एक खाते असावी लागते आणि मॉडेल्स वापरण्यासाठी लायसेंस स्वीकारा लागते: https://huggingface.co/pyannote/speaker-diarization आणि https://huggingface.co/pyannote/segmentation. 
आपल्याला येथे आपला की टोकन मिळेल: https://hf.co/settings/tokens", + "ht_token_ph": "टोकन येथे जाते...", + "tab_docs": "कागदपत्र अनुवाद", + "docs_input_label": "कागदपत्र स्रोत निवडा", + "docs_input_info": "ते पीडीएफ, डॉक्स, टीएक्सट किंवा मजकूर होऊ शकते", + "docs_source_info": "हे मजकूरची मूळ भाषा आहे", + "chunk_size_label": "प्रत्येक सेगमेंट प्रत्येक करकटाने TTS ला प्रक्रिया करण्यासाठी अधिकतम अक्षरांची संख्या", + "chunk_size_info": "0 चा मूल्य एक विनामूल्य आणि अधिक संगणकांसाठी संगणकात अधिक संगणकांसाठी अनुकूलित मूल्य नेमल्याची अर्थी होतो.", + "docs_button": "भाषा कन्वर्ट ब्रिज सुरू करा", + "cv_url_info": "यूआरएलपासून ऑटोमॅटिक रॉकी मॉडेल्स डाउनलोड करा. तुम्ही HuggingFace किंवा Drive ची लिंक वापरू शकता, आणि तुम्हाला किंवा तुम्हाला प्रत्येक लिंक, प्रत्येक लिंक समाविष्ट करण्यासाठी प्रत्येक लिंक वापरू शकता, प्रत्येक लिंक वापरू शकता. उदाहरण: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "आवाज बदला: TTS ते R.V.C.", + "sec1_title": "### 1. त्याचा वापर सक्षम करण्यासाठी, ते सक्षम जाहीर करा.", + "enable_replace": "मॉडेल्सचा वापर सक्षम करण्यासाठी हे तपासा.", + "sec2_title": "### 2. प्रत्येक TTS च्या प्रत्येक प्रतिनिधीत्व करण्यासाठी आवाज निवडा आणि सेटिंग्ज लागू करा.", + "sec2_subtitle": "आपण किती वापरणार आहात यानुसार, प्रत्येकाने स्वत: च्या मॉडेलची आवश्यकता आहे. अधिक केल्यासाठी, अधिक स्पेकरच्या उपयोगासाठी एक सहाय्यक असते जर कारणाने वक्ता सही रिकामे ओळखले जात नाहीत.", + "cv_tts1": "स्पीकर 1 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts2": "स्पीकर 2 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts3": "स्पीकर 3 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts4": "स्पीकर 4 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts5": "स्पीकर 5 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts6": "स्पीकर 6 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts7": "स्पीकर 7 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts8": "स्पीकर 8 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts9": "स्पीकर 9 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts10": "स्पीकर 10 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts11": "स्पीकर 11 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_tts12": "स्पीकर 12 साठी लागू करण्यासाठी आवाज निवडा.", + "cv_aux": "- जर कारणाने वक्ता सही ओळखले जात नाही तर लागू करण्यासाठी आवाज.", + "cv_button_apply": "सेटिंग्ज लागू करा", + "tab_help": "मदत", + }, + "azerbaijani": { + "description": """ + ### 🎥 **SoniTranslate ilə videoları asanlıqla tərcümə edin!** 📽️ + + Video, səs faylı yükləyin və ya YouTube bağlantısı təqdim edin. 📽️ **SoniTranslate-in rəsmi repositoriyasından yenilənmiş qeydləri alın: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + İstifadəsi üçün təlimatlar üçün `Kömək` sekmesinə baxın. Video tərcüməsi ilə əyləncəyə başlayaq! 🚀🎉 + """, + "tutorial": """ + # 🔰 **İstifadə təlimatları:** + + 1. 📤 **Video**, **səs faylı** yükləyin və ya 🌐 **YouTube bağlantısı** təqdim edin. + + 2. 🌍 **Videonu tərcümə etmək istədiyiniz dilə** seçin. + + 3. 🗣️ **Videoda danışan insanların sayını** göstərin və **hər birinə uyğun tərcümə dilində məsələlərin səsləndirilməsi üçün tələb edilən səsləndirməni təyin edin.** + + 4. 🚀 '**Tərcümə et**' düyməsini basın və nəticələri əldə edin. + + --- + + # 🧩 **SoniTranslate, fərqli TTS (Mətnə Səsləndirmə) mühərriklərini dəstəkləyir ki, onlar:** + - EDGE-TTS → format `en-AU-WilliamNeural-Male` → Sürətli və dəqiqdir. + - FACEBOOK MMS → format `en-facebook-mms VITS` → Səsi daha doğaldır; ancaq ancaq CPU istifadə edir. 
+ - PIPER TTS → format `en_US-lessac-high VITS-onnx` → Əvvəlki ilə eynidir, ancaq hem CPU, hem də GPU üçün optimalaşdırılmışdır. + - BARK → format `en_speaker_0-Male BARK` → Yaxşı keyfiyyətli, ancaq yavaş və halüsinasiyalara meyllidir. + - OpenAI TTS → format `>alloy OpenAI-TTS` → Çoxdilli, lakin OpenAI API açarı tələb olunur + - Coqui XTTS → format `_XTTS_/AUTOMATIC.wav` → Yalnız Çin (Sadələşdirilmiş), İngilis, Fransız, Alman, İtalyan, Portuqal, Poliş, Türk, Rus, Holland, Çex, Ərəb, İspan, Macar, Korey və Yapon dilində mövcuddur. + + --- + + # 🎤 R.V.C. və R.V.C.2 Səsləri Necə İstifadə Etmək (İstəyə Bağlı) 🎶 + + Məqsəd, tərtib olunmuş TTS (Mətnə Səsləndirmə) -ə bir R.V.C. tətbiq etməkdir 🎙️ + + 1. `Xüsusi Səs R.V.C.` tabınızda ehtiyacınız olan modelləri yükləyin 📥 Hugging Face və Google Drive-da linklərdən, zip, pth və ya index formatlarında istifadə edə bilərsiniz. HF məkan repositoriyalarını da yükləyə bilərsiniz, lakin bu seçim çox sabit deyil 😕 + + 2. İndi, `Səsləndiriciyi əvəzlə: TTS to R.V.C.` -ni işarələyin və `aktivləşdirmək` qutusunu seçin ✅ Bundan sonra, istədiyiniz modelləri hər bir TTS speaker üçün tətbiq edə bilərsiniz 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Bütün R.V.C. -yə tətbiq olunacaq F0 metodunu tənzimləyin 🎛️ + + 4. Dəyişiklikləri tətbiq etmək üçün `KONFİQURASYANI TƏTİBİ ET` düyməsini basın 🔄 + + 5. Video tərcüməsi tabınıza qayıdın və 'Tərcümə et'ə klikləyin ▶️ Artıq tərcümə, R.V.C. tətbiq edilərək həyata keçirilir 🗣️ + + Məsləhət: R.V.C -ni təcrübə və ən yaxşı TTS və ya konfiqurasiyaları tapmaq üçün `Test R.V.C.` istifadə edə bilərsiniz 🧪🔍 + + --- + + """, + "tab_translate": "Video tərcüməsi", + "video_source": "Video mənbəyi seçin", + "link_label": "Mediya bağlantısı.", + "link_info": "Nümunə: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL buraya daxil olur...", + "dir_label": "Video Yolu.", + "dir_info": "Nümunə: /usr/home/my_video.mp4", + "dir_ph": "Yol buraya daxil olur...", + "sl_label": "Mənbə dil", + "sl_info": "Bu videoyun əsas dilidir", + "tat_label": "Audio tərcüməsi", + "tat_info": "Hədəf dil seçin və həmçinin o dil üçün uyğun olan TTS-i seçdiyinizdən əmin olun.", + "num_speakers": "Videoda danışan insanların sayını seçin.", + "min_sk": "Min speakerlər", + "max_sk": "Max speakerlər", + "tts_select": "Hər bir səsçiyə istədiyiniz səsi seçin.", + "sk1": "TTS Səsçi 1", + "sk2": "TTS Səsçi 2", + "sk3": "TTS Səsçi 3", + "sk4": "TTS Səsçi 4", + "sk5": "TTS Səsçi 5", + "sk6": "TTS Səsçi 6", + "sk7": "TTS Səsçi 7", + "sk8": "TTS Səsçi 8", + "sk9": "TTS Səsçi 9", + "sk10": "TTS Səsçi 10", + "sk11": "TTS Səsçi 11", + "sk12": "TTS Səsçi 12", + "vc_title": "Fərqli dillərdə Səs İmələsi", + "vc_subtitle": """ + ### Bir insanın səsini müxtəlif dillərdə çoğaldın. + Əksər səslər üçün effektiv olsa da, hər halda tam mükəmməlliyətə nail olmayabilir. + Səs imitasiyası sadəcə referans səsçinin tonunu çoxaldır, aksent və həssaslar, istifadə olunan əsas səsçi TTS modeli tərəfindən nəzarət olunur və çevirici tərəfindən çoğaldırılmır. + Bu, hər səsçi üçün əsas səs məlumatlarını alır və onları işləyir. 
+ """, + "vc_active_label": "Fəal Səs İmələsi", + "vc_active_info": "Fəal Səs İmələsi: orijinal səsçinin tonunu çoğaldır", + "vc_method_label": "Metod", + "vc_method_info": "Səs İmələsi prosesində metod seçin", + "vc_segments_label": "Maksimum nümunələr", + "vc_segments_info": "Maksimum nümunələr: Proses üçün yaradılacaq səs nümunələrinin sayıdır, daha çoxu daha yaxşıdır, lakin gürültü əlavə edə bilər", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Səs nümunələrinə vokal dereverb tətbiq edir.", + "vc_remove_label": "Əvvəlki nümunələri silin", + "vc_remove_info": "Əvvəlki nümunələri silin: Əvvəlki yaradılmış nümunələri silir, beləliklə yeni olanları yaratmaq lazımdır.", + "xtts_title": "Səsə əsaslanan bir TTS yaratın", + "xtts_subtitle": "Maksimum 10 saniyəlik bir səs faylı yükləyin. XTTS istifadə edərək, müvafiq səslə bir TTS yeni bir səs yaradılacaq.", + "xtts_file_label": "Səslə qısa bir səs yükləyin", + "xtts_name_label": "TTS üçün ad", + "xtts_name_info": "Sadə bir ad istifadə edin", + "xtts_dereverb_label": "Səsi dereverb edin", + "xtts_dereverb_info": "Səsi dereverb edin: Səsə vokal dereverb tətbiq edir", + "xtts_button": "Səsi proses edin və TTS seçiciyə daxil edin", + "xtts_footer": "Səs xtts-ini avtomatik olaraq yaradın: Tərcüməni yaratarkən hər səsçiyə avtomatik olaraq segmentlər yaratmaq üçün TTS seçicidə `_XTTS_/AUTOMATIC.wav` -dən istifadə edə bilərsiniz.", + "extra_setting": "Əlavə Ayarlar", + "acc_max_label": "Maksimum Audio sürəti", + "acc_max_info": "Üstünlük təşkil etməmək üçün tərcümə olunmuş audio segmentlərinin maksimum sürəti. 1.0 dəyəri heç bir sürəti təşkil etmir", + "acc_rate_label": "Sürətin Artımının Tənzimlənməsi", + "acc_rate_info": "Sürətin Artımının Tənzimlənməsi: Sürəti az olan segmentlərə uyğun olaraq sürəti tənzimləyir, davam etməni qoruyur və növbəti başlanğıcın vaxtını nəzərə alır.", + "or_label": "Üstünlüklərin Azaldılması", + "or_info": "Üstünlüklərin Azaldılması: Segmentlərin bir-birinin üstündə olmamasını təmin edir, əvvəlki bitiş vaxtlarına əsasən başlanğıc vaxtlarını tənzimləyərək; sinxronlaşmaya mane ola bilər.", + "aud_mix_label": "Audio qarışdırma metodları", + "aud_mix_info": "Orijinal və tərcümə olunmuş audio fayllarını qarışdıraraq iki mövcud qarışdırma rejimi ilə xüsusi, dengəli bir çıxış yaradın.", + "vol_ori": "Orijinal səsin səsi", + "vol_tra": "Tərcümə olunmuş audio səsi", + "voiceless_tk_label": "Səssiz Trekk", + "voiceless_tk_info": "Səssiz Trekk: Tərcümə olunmuş audio ilə birləşdirilmədən əvvəl orijinal audio səsini silin.", + "sub_type": "Subtitrlərin növü", + "soft_subs_label": "Yumuşaq Subtitrlər", + "soft_subs_info": "Yumuşaq Subtitrlər: İzləyicilərin videonu izləyərkən açıb bağlaya biləcəyi seçməlik subtitrlər.", + "burn_subs_label": "Altyazıları Yanma", + "burn_subs_info": "Altyazıları Yanma: Altyazıları videoya ilave edərək, onları görünən məzmunun daimi bir hissəsi halına gətirin.", + "whisper_title": "Tərcümə edilən mətnin konfiqurasiyası.", + "lnum_label": "Rəqəmləri Litarallarlaşdırmaq", + "lnum_info": "Rəqəmləri Litarallarlaşdırmaq: Sayısal təsvirləri onların yazılı müqabilələri ilə əvəzləyin.", + "scle_label": "Səs Təmizliyi", + "scle_info": "Səs Təmizliyi: Maksimum vaxt damğası dəqiqliyi üçün səsi yaxşılaşdırın, transkripsiyadan əvvəl fon gürültüsünü çıxarın. Bu əməliyyat uzun səs faylları ilə xüsusilə vaxt ala bilər.", + "sd_limit_label": "Segment Müddəti Məhdudiyyəti", + "sd_limit_info": "Hər bir segment üçün maksimum müddəti (saniyə) təyin edin. 
Səs VAD-dan istifadə edilərək hər bir segment parçasının müddəti məhdudlaşdırılacaq.", + "asr_model_info": "Bu, default olaraq danışılan dilə mətni 'Əfsus' modeli istifadə edərək mətnə çevirir. Xüsusi model istifadə edin, məsələn, çin dilində fayin-tuninq edilmiş model istifadə etmək üçün 'BELLE-2/Belle-whisper-large-v3-zh' depozit adını keçid menyusuna daxil edin. Hugging Face-də fayin-tuninq edilmiş modelləri tapın.", + "ctype_label": "Hesablama Növü", + "ctype_info": "int8 və ya float16 kimi kiçik növ seçmək yaddaş istifadəsini azaldaraq və hesablama nəzarətini artıraraq performansı yaxşılaşdıra bilər, lakin float32 kimi daha böyük veri növlərinə nisbətən dəqiqliyi fəda etmək olar.", + "batchz_label": "Toplu Ölçüsü", + "batchz_info": "Toplu ölçüsünü azaldaraq, əğer GPU-nuzun az VRAM varsa, yaddaş qənaət etmək mümkündür və Yaddaşsız Yaddaş problemə idarə edə bilər.", + "tsscale_label": "Mətn Segmentlərinin Masshtabı", + "tsscale_info": "Mətni cümlə, söz və ya simvollarla segmentlərə bölmək. Söz və simvol bölməsi, subtitrlər üçün faydalı olan daha dəqiqliyi təmin edir; tərcüməni söndürmək asal strukturu qoruyur.", + "srt_file_label": "Bir SRT subtitri faylı yükləyin (Fısıldağın transkripsiyası əvəzinə istifadə olunacaq)", + "divide_text_label": "Mətn segmentlərini bölmək üçün ayırıcı daxil edin:", + "divide_text_info": "(Təcrübəli) Mövcud mətn segmentlərini böləcək bir ayırıcı daxil edin. Alətlər tez-tez yaradır və uyğun gələn yerlərdə yeni segmentlər yaradır. Birdən çox ayırıcı daxil edin, |, misal: !|?|...|。", + "diarization_label": "Diyarizasiya Modeli", + "tr_process_label": "Tərcümə Prosesi", + "out_type_label": "Çıxış növü", + "out_name_label": "Fayl adı", + "out_name_info": "Çıxış faylının adı", + "task_sound_label": "Tapşırığın Vəziyyət Səsi", + "task_sound_info": "Tapşırığın Vəziyyət Səsi: Tapşırığın başa çatdığını və ya icra zamanı xətalıları göstərən səsli xəbərdarlıq səsi oxuyur.", + "cache_label": "İrəliyə Alma İşləmi", + "cache_info": "İrəliyə Alma İşləmi: Son yoxlama nöqtəsindən davam etmək.", + "preview_info": "Təcrübə məqsədi ilə videoyu yalnız 10 saniyəyə kəsir. Tam video müddətini əldə etmək üçün onu deaktiv edin.", + "edit_sub_label": "Yaradılan subtitrləri redaktə edin", + "edit_sub_info": "Yaradılan subtitrləri redaktə edin: Tərcüməni 2 addımlı olaraq başlatmaq üçün olan imkan. İlk olaraq 'SUBTITRİ AL VƏ REDAKTƏ ET' düyməsini basaraq subtitrləri alın, onları redaktə edin və sonra 'TƏRCÜMƏ ET' düyməsini basaraq video yarada bilərsiniz", + "button_subs": "SUBTITRİ AL VƏ REDAKTƏ ET", + "editor_sub_label": "Yaradılan subtitrlər", + "editor_sub_info": "Burada yaradılan subtitrlərdə mətni redaktə etmək azadır. Interfeys seçimlərini dəyişdirə bilərsiniz, lakin xəbərdarlıq olaraq 'Mənbə dil', 'Audio tərcüməsi' və 'Max speakerlər' üçün xətalara yol verməmək üçün, 'TƏRCÜMƏ ET' düyməsini basmadan əvvəl. Bitdikdən sonra, 'TƏRCÜMƏ ET' düyməsini basın.", + "editor_sub_ph": "İlk olaraq 'SUBTITRİ AL VƏ REDAKTƏ ET' düyməsini basın və subtitrləri alın", + "button_translate": "TƏRCÜMƏ ET", + "output_result_label": "TƏRCÜMƏ OLUNMUŞ VİDEOYU YÜKLƏYİN", + "sub_ori": "Subtitrlər", + "sub_tra": "Tərcümə olunmuş subtitrlər", + "ht_token_info": "Pyannote istifadəsi üçün lisenziya razılaşmasını qəbul etmək önəmli addımdır. Model istifadə etmək üçün Hugging Face-da hesabınız olmalı və modelləri istifadə etmək üçün lisenziya qəbul etməlisiniz: https://huggingface.co/pyannote/speaker-diarization və https://huggingface.co/pyannote/segmentation. 
Özüncə TOKENİNİZİ buradan əldə edin: https://hf.co/settings/tokens", + "ht_token_ph": "Token buraya daxil olur...", + "tab_docs": "Sənəd tərcüməsi", + "docs_input_label": "Sənəd mənbəyini seçin", + "docs_input_info": "PDF, DOCX, TXT və ya mətn ola bilər", + "docs_source_info": "Bu mətnin əsas dili", + "chunk_size_label": "TTS-in hər segmenti üçün təşkil olunan maksimum simvolların sayı", + "chunk_size_info": "0 dəyəri TTS üçün dinamik və daha uyğun bir dəyər təyin edir.", + "docs_button": "Dil Dəyişikliyi Köprüsünü Başlat", + "cv_url_info": "R.V.C. modellərini URL-dən avtomatik olaraq yükləyin. HuggingFace və Drive linklərindən istifadə edə bilərsiniz, və hər birini vergül ilə ayrılmış bir neçə link daxil edə bilərsiniz. Misal: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Səs əvəzləmə: TTS-dən R.V.C.-yə", + "sec1_title": "### 1. İstifadəsini aktivləşdirmək üçün onu aktiv edin.", + "enable_replace": "Bu modellərin istifadəsini aktivləşdirmək üçün bunu işarələyin.", + "sec2_title": "### 2. Hər bir uyğun səsçi TTS-ə tətbiq olunacaq səsi seçin və konfiqurasiyaları tətbiq edin.", + "sec2_subtitle": "Istifadə edəcəyiniz sayına bağlı olaraq, hər biri öz modellərinə ehtiyac duyar. Əlavə olaraq, səsçi doğru şəkildə aşkar edilmirsə yardımcı bir tətbiqi mövcuddur.", + "cv_tts1": "1-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts2": "2-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts3": "3-cü Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts4": "4-cü Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts5": "5-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts6": "6-cı Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts7": "7-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts8": "8-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts9": "9-cu Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts10": "10-cu Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts11": "11-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_tts12": "12-ci Səsçi üçün tətbiq olunacaq səsi seçin.", + "cv_aux": "- Səsçi doğru şəkildə aşkar edilmirsə tətbiq ediləcək səs.", + "cv_button_apply": "KONFiQURASiYANI TƏTBiQ EDiN", + "tab_help": "Kömək", + }, + + "persian": { + "description": """ + ### 🎥 **با SoniTranslate به راحتی ویدئوها را ترجمه کنید!** 📽️ + + یک ویدئو، فایل زیرنویس، فایل صوتی را آپلود کنید یا یک لینک ویدئوی URL ارائه دهید. 📽️ **دفترچه یادداشت به‌روز شده را از مخزن رسمی دریافت کنید: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + دستورالعمل‌های استفاده را در تب `Help` ببینید. بیایید با ترجمه ویدئوها سرگرم شویم! 🚀🎉 + """, + "tutorial": """ + # 🔰 **دستورالعمل استفاده:** + + 1. 📤 یک **ویدئو**، **فایل زیرنویس**، **فایل صوتی** را آپلود کنید یا 🌐 **لینک URL** به یک ویدئو مانند یوتیوب ارائه دهید. + + 2. 🌍 زبانی را که می‌خواهید **ویدئو را به آن ترجمه کنید** انتخاب کنید. + + 3. 🗣️ تعداد **افراد گوینده** در ویدئو را مشخص کنید و **برای هرکدام یک صدای متن به گفتار مناسب** برای زبان ترجمه انتخاب کنید. + + 4. 🚀 دکمه '**ترجمه**' را فشار دهید تا نتایج را دریافت کنید. + + --- + + # 🧩 **SoniTranslate از موتورهای مختلف TTS (متن به گفتار) پشتیبانی می‌کند، که شامل:** + - EDGE-TTS → فرمت `en-AU-WilliamNeural-Male` → سریع و دقیق. + - FACEBOOK MMS → فرمت `en-facebook-mms VITS` → صدای طبیعی‌تر؛ در حال حاضر فقط از CPU استفاده می‌کند. + - PIPER TTS → فرمت `en_US-lessac-high VITS-onnx` → مانند قبلی، اما برای CPU و GPU بهینه‌سازی شده است. + - BARK → فرمت `en_speaker_0-Male BARK` → کیفیت خوب ولی کند و مستعد هذیان. 
+ - OpenAI TTS → فرمت `>alloy OpenAI-TTS` → چندزبانه اما نیاز به کلید API OpenAI دارد. + - Coqui XTTS → فرمت `_XTTS_/AUTOMATIC.wav` → فقط برای چینی (ساده‌شده)، انگلیسی، فرانسوی، آلمانی، ایتالیایی، پرتغالی، لهستانی، ترکی، روسی، هلندی، چک، عربی، اسپانیایی، مجارستانی، کره‌ای و ژاپنی در دسترس است. + + --- + + # 🎤 چگونه از صداهای R.V.C. و R.V.C.2 استفاده کنیم (اختیاری) 🎶 + + هدف اعمال R.V.C. به TTS تولید شده است 🎙️ + + 1. در تب `Custom Voice R.V.C.` مدل‌های مورد نیاز را دانلود کنید 📥 می‌توانید از لینک‌های Hugging Face و Google Drive در قالب‌های zip، pth، یا index استفاده کنید. همچنین می‌توانید مخازن کامل HF را دانلود کنید، اما این گزینه خیلی پایدار نیست 😕 + + 2. حالا به `Replace voice: TTS to R.V.C.` بروید و جعبه `enable` را تیک بزنید ✅ پس از این، می‌توانید مدل‌هایی را که می‌خواهید به هر سخنگوی TTS اعمال کنید انتخاب کنید 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. روش F0 که برای همه R.V.C. اعمال خواهد شد تنظیم کنید 🎛️ + + 4. دکمه `APPLY CONFIGURATION` را فشار دهید تا تغییرات اعمال شود 🔄 + + 5. به تب ترجمه ویدئو بازگردید و بر روی 'Translate' کلیک کنید ▶️ حالا ترجمه با اعمال R.V.C. انجام خواهد شد 🗣️ + + نکته: می‌توانید از `Test R.V.C.` استفاده کنید تا بهترین TTS یا تنظیمات را برای اعمال به R.V.C. آزمایش و پیدا کنید 🧪🔍 + + --- + + """, + "tab_translate": "ترجمه ویدئو", + "video_source": "منبع ویدئو را انتخاب کنید", + "link_label": "لینک رسانه.", + "link_info": "مثال: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "لینک URL را اینجا وارد کنید...", + "dir_label": "مسیر ویدئو.", + "dir_info": "مثال: /usr/home/my_video.mp4", + "dir_ph": "مسیر را اینجا وارد کنید...", + "sl_label": "زبان مبدا", + "sl_info": "این زبان اصلی ویدئو است", + "tat_label": "ترجمه صوتی به", + "tat_info": "زبان مقصد را انتخاب کنید و همچنین مطمئن شوید که TTS مربوط به آن زبان را انتخاب کنید.", + "num_speakers": "تعداد افراد گوینده در ویدئو را انتخاب کنید.", + "min_sk": "حداقل گوینده‌ها", + "max_sk": "حداکثر گوینده‌ها", + "tts_select": "صدای مورد نظر برای هر گوینده را انتخاب کنید.", + "sk1": "گوینده TTS 1", + "sk2": "گوینده TTS 2", + "sk3": "گوینده TTS 3", + "sk4": "گوینده TTS 4", + "sk5": "گوینده TTS 5", + "sk6": "گوینده TTS 6", + "sk7": "گوینده TTS 7", + "sk8": "گوینده TTS 8", + "sk9": "گوینده TTS 9", + "sk10": "گوینده TTS 10", + "sk11": "گوینده TTS 11", + "sk12": "گوینده TTS 12", + "vc_title": "تقلید صدا در زبان‌های مختلف", + "vc_subtitle": """ + ### صدای یک فرد را در زبان‌های مختلف بازتولید کنید. + در حالی که با اکثر صداها به درستی کار می‌کند، ممکن است در هر مورد به صورت کامل عمل نکند. + تقلید صدا تنها لحن گوینده مرجع را بازتولید می‌کند، بدون لهجه و احساسات که توسط مدل پایه TTS تعیین می‌شوند و توسط مبدل بازتولید نمی‌شوند. + این کار نمونه‌های صوتی را از صدای اصلی هر گوینده گرفته و پردازش می‌کند. + """, + "vc_active_label": "تقلید صدا فعال است", + "vc_active_info": "تقلید صدا فعال: لحن گوینده اصلی را بازتولید می‌کند", + "vc_method_label": "روش", + "vc_method_info": "یک روش برای فرآیند تقلید صدا انتخاب کنید", + "vc_segments_label": "حداکثر نمونه‌ها", + "vc_segments_info": "حداکثر نمونه‌ها: تعداد نمونه‌های صوتی که برای فرآیند تولید خواهند شد، بیشتر بهتر است اما ممکن است نویز اضافه کند", + "vc_dereverb_label": "حذف اکو", + "vc_dereverb_info": "حذف اکو: حذف اکو صوتی از نمونه‌های صوتی.", + "vc_remove_label": "حذف نمونه‌های قبلی", + "vc_remove_info": "حذف نمونه‌های قبلی: حذف نمونه‌های قبلی تولید شده، بنابراین نمونه‌های جدید نیاز به تولید دارند.", + "xtts_title": "ایجاد TTS بر اساس یک فایل صوتی", + "xtts_subtitle": "یک فایل صوتی کوتاه با صدای حداکثر 10 ثانیه آپلود کنید. 
با استفاده از XTTS، یک TTS جدید با صدای مشابه به فایل صوتی ارائه شده ایجاد خواهد شد.", + "xtts_file_label": "یک فایل صوتی کوتاه با صدا آپلود کنید", + "xtts_name_label": "نام برای TTS", + "xtts_name_info": "یک نام ساده استفاده کنید", + "xtts_dereverb_label": "حذف اکو صوتی", + "xtts_dereverb_info": "حذف اکو صوتی: حذف اکو از صوت", + "xtts_button": "پردازش صوت و افزودن آن به انتخابگر TTS", + "xtts_footer": "تولید صدای XTTS به طور خودکار: می‌توانید از `_XTTS_/AUTOMATIC.wav` در انتخابگر TTS برای تولید خودکار بخش‌ها برای هر گوینده هنگام تولید ترجمه استفاده کنید.", + "extra_setting": "تنظیمات پیشرفته", + "acc_max_label": "حداکثر شتاب صوتی", + "acc_max_info": "حداکثر شتاب برای بخش‌های صوتی ترجمه شده برای جلوگیری از تداخل. مقدار 1.0 نمایانگر بدون شتاب است", + "acc_rate_label": "تنظیم نرخ شتاب", + "acc_rate_info": "تنظیم نرخ شتاب: تنظیم شتاب برای سازگاری با بخش‌هایی که نیاز به سرعت کمتری دارند، حفظ پیوستگی و در نظر گرفتن زمان شروع بعدی.", + "or_label": "کاهش تداخل", + "or_info": "کاهش تداخل: اطمینان از عدم تداخل بخش‌ها با تنظیم زمان شروع بر اساس زمان پایان قبلی؛ ممکن است همگام‌سازی را مختل کند.", + "aud_mix_label": "روش ترکیب صوتی", + "aud_mix_info": "میکس فایل‌های صوتی اصلی و ترجمه شده برای ایجاد خروجی سفارشی و متعادل با دو حالت میکس موجود.", + "vol_ori": "حجم صدای اصلی", + "vol_tra": "حجم صدای ترجمه شده", + "voiceless_tk_label": "مسیر بدون صدا", + "voiceless_tk_info": "مسیر بدون صدا: حذف صدای اصلی قبل از ترکیب آن با صدای ترجمه شده.", + "sub_type": "نوع زیرنویس", + "soft_subs_label": "زیرنویس نرم", + "soft_subs_info": "زیرنویس نرم: زیرنویس‌های اختیاری که بینندگان می‌توانند آنها را هنگام تماشا روشن یا خاموش کنند.", + "burn_subs_label": "زیرنویس سوخته", + "burn_subs_info": "زیرنویس سوخته: تعبیه زیرنویس‌ها در ویدئو، که آنها را به بخشی دائمی از محتوای بصری تبدیل می‌کند.", + "whisper_title": "پیکربندی رونوشت.", + "lnum_label": "نوشتاری اعداد", + "lnum_info": "نوشتاری اعداد: جایگزین نمایش عددی با معادل‌های نوشتاری آنها در رونوشت.", + "scle_label": "پاکسازی صدا", + "scle_info": "پاکسازی صدا: تقویت صداها، حذف نویز پس‌زمینه قبل از رونوشت برای دقت زمان‌بندی بالا. این عملیات ممکن است زمان ببرد، به ویژه با فایل‌های صوتی طولانی.", + "sd_limit_label": "حداکثر مدت زمان بخش", + "sd_limit_info": "حداکثر مدت زمان برای هر بخش را مشخص کنید. صوت با استفاده از VAD پردازش خواهد شد، و مدت زمان برای هر بخش محدود خواهد شد.", + "asr_model_info": "این مدل زبان گفتاری را به متن تبدیل می‌کند و از مدل 'Whisper' به‌صورت پیش‌فرض استفاده می‌کند. از یک مدل سفارشی استفاده کنید، برای مثال، با وارد کردن نام مخزن 'BELLE-2/Belle-whisper-large-v3-zh' در لیست کشویی برای استفاده از مدل چینی فاین‌تیون شده. مدل‌های فاین‌تیون شده را در Hugging Face پیدا کنید.", + "ctype_label": "نوع محاسبه", + "ctype_info": "انتخاب انواع کوچکتر مانند int8 یا float16 می‌تواند عملکرد را با کاهش استفاده از حافظه و افزایش توان محاسباتی بهبود بخشد، اما ممکن است دقت را نسبت به انواع داده‌های بزرگ‌تر مانند float32 فدا کند.", + "batchz_label": "اندازه دسته", + "batchz_info": "کاهش اندازه دسته حافظه را ذخیره می‌کند اگر GPU شما VRAM کمتری دارد و کمک می‌کند به مدیریت مشکلات کمبود حافظه.", + "tsscale_label": "مقیاس بخش‌بندی متن", + "tsscale_info": "تقسیم متن به بخش‌ها با جملات، کلمات، یا کاراکترها. بخش‌بندی کلمه و کاراکتر دانه‌بندی بیشتری ارائه می‌دهد که برای زیرنویس‌ها مفید است؛ غیرفعال کردن ترجمه ساختار اصلی را حفظ می‌کند.", + "srt_file_label": "یک فایل زیرنویس SRT آپلود کنید (به جای رونوشت Whisper استفاده خواهد شد)", + "divide_text_label": "تقسیم مجدد بخش‌های متن توسط:", + "divide_text_info": "(آزمایشی) یک جداکننده برای تقسیم بخش‌های موجود متن در زبان منبع وارد کنید. 
ابزار وقوع‌ها را شناسایی کرده و بخش‌های جدید را بر اساس آن ایجاد می‌کند. چندین جداکننده را با | مشخص کنید، به عنوان مثال: !|?|...|。", + "diarization_label": "مدل دیاریزیشن", + "tr_process_label": "فرآیند ترجمه", + "out_type_label": "نوع خروجی", + "out_name_label": "نام فایل", + "out_name_info": "نام فایل خروجی", + "task_sound_label": "صدای وضعیت کار", + "task_sound_info": "صدای وضعیت کار: پخش صدای هشدار نشان‌دهنده تکمیل کار یا خطاها در حین اجرا.", + "cache_label": "بازیابی پیشرفت", + "cache_info": "بازیابی پیشرفت: ادامه فرآیند از آخرین نقطه توقف.", + "preview_info": "پیش‌نمایش ویدئو را به 10 ثانیه برای آزمایش برش می‌دهد. لطفاً آن را غیرفعال کنید تا ویدئوی کامل را دریافت کنید.", + "edit_sub_label": "ویرایش زیرنویس‌های تولید شده", + "edit_sub_info": "ویرایش زیرنویس‌های تولید شده: به شما امکان می‌دهد ترجمه را در دو مرحله انجام دهید. ابتدا با دکمه 'GET SUBTITLES AND EDIT' زیرنویس‌ها را بگیرید و ویرایش کنید، و سپس با دکمه 'TRANSLATE' ویدئو را تولید کنید", + "button_subs": "GET SUBTITLES AND EDIT", + "editor_sub_label": "زیرنویس‌های تولید شده", + "editor_sub_info": "می‌توانید متن زیرنویس‌های تولید شده را اینجا ویرایش کنید. قبل از کلیک بر روی دکمه 'TRANSLATE' می‌توانید تغییرات را در گزینه‌های رابط ایجاد کنید، به جز 'زبان منبع'، 'ترجمه صوتی به' و 'حداکثر گوینده‌ها'، تا از بروز خطاها جلوگیری شود. پس از اتمام، دکمه 'TRANSLATE' را فشار دهید.", + "editor_sub_ph": "ابتدا دکمه 'GET SUBTITLES AND EDIT' را فشار دهید تا زیرنویس‌ها را دریافت کنید", + "button_translate": "TRANSLATE", + "output_result_label": "دانلود ویدئوی ترجمه شده", + "sub_ori": "زیرنویس‌ها", + "sub_tra": "زیرنویس‌های ترجمه شده", + "ht_token_info": "یکی از مراحل مهم قبول موافقتنامه مجوز برای استفاده از Pyannote است. شما نیاز به داشتن یک حساب کاربری در Hugging Face و قبول مجوز برای استفاده از مدل‌ها دارید: https://huggingface.co/pyannote/speaker-diarization و https://huggingface.co/pyannote/segmentation. کلید TOKEN خود را اینجا بگیرید: https://hf.co/settings/tokens", + "ht_token_ph": "کلید TOKEN را اینجا وارد کنید...", + "tab_docs": "ترجمه اسناد", + "docs_input_label": "منبع سند را انتخاب کنید", + "docs_input_info": "می‌تواند PDF، DOCX، TXT، یا متن باشد", + "docs_source_info": "این زبان اصلی متن است", + "chunk_size_label": "حداکثر تعداد کاراکترهایی که TTS در هر بخش پردازش خواهد کرد", + "chunk_size_info": "مقدار 0 یک مقدار پویا و سازگارتر برای TTS اختصاص می‌دهد.", + "docs_button": "شروع پل تبدیل زبان", + "cv_url_info": "مدل‌های R.V.C. را به صورت خودکار از URL دانلود کنید. می‌توانید از لینک‌های HuggingFace یا Drive استفاده کنید و می‌توانید چندین لینک را شامل کنید، هرکدام با کاما جدا شده باشند. مثال: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "تعویض صدا: TTS به R.V.C.", + "sec1_title": "### 1. برای فعال‌سازی استفاده، آن را به عنوان فعال علامت بزنید.", + "enable_replace": "این را بررسی کنید تا استفاده از مدل‌ها فعال شود.", + "sec2_title": "### 2. صدایی را که به هر TTS هر گوینده اعمال خواهد شد انتخاب کنید و تنظیمات را اعمال کنید.", + "sec2_subtitle": "بسته به تعداد <گوینده TTS> که استفاده می‌کنید، هرکدام به مدل مربوطه خود نیاز دارند. 
علاوه بر این، یک مدل کمکی نیز وجود دارد که در صورت عدم تشخیص صحیح گوینده استفاده می‌شود.", + "cv_tts1": "صدایی را برای گوینده 1 انتخاب کنید.", + "cv_tts2": "صدایی را برای گوینده 2 انتخاب کنید.", + "cv_tts3": "صدایی را برای گوینده 3 انتخاب کنید.", + "cv_tts4": "صدایی را برای گوینده 4 انتخاب کنید.", + "cv_tts5": "صدایی را برای گوینده 5 انتخاب کنید.", + "cv_tts6": "صدایی را برای گوینده 6 انتخاب کنید.", + "cv_tts7": "صدایی را برای گوینده 7 انتخاب کنید.", + "cv_tts8": "صدایی را برای گوینده 8 انتخاب کنید.", + "cv_tts9": "صدایی را برای گوینده 9 انتخاب کنید.", + "cv_tts10": "صدایی را برای گوینده 10 انتخاب کنید.", + "cv_tts11": "صدایی را برای گوینده 11 انتخاب کنید.", + "cv_tts12": "صدایی را برای گوینده 12 انتخاب کنید.", + "cv_aux": "- صدایی که در صورت عدم تشخیص موفقیت‌آمیز گوینده اعمال خواهد شد.", + "cv_button_apply": "اعمال تنظیمات", + "tab_help": "کمک", + }, + + "afrikaans": { + "description": """ + ### 🎥 **Vertaal video's maklik met SoniTranslate!** 📽️ + + Laai 'n video, onderskrif, klanklêer op of verskaf 'n URL-videolink. 📽️ **Kry die opgedateerde notaboek van die amptelike repository: [SoniTranslate](https://github.com/R3gm/SoniTranslate)!** + + Sien die tab 'Hulp' vir instruksies oor hoe om dit te gebruik. Kom ons begin pret hê met videovertaal! 🚀🎉 + """, + "tutorial": """ + # 🔰 **Instruksies vir gebruik:** + + 1. 📤 Laai 'n **video**, **onderskriflêer**, **klanklêer** op of verskaf 'n 🌐 **URL link** na 'n video soos YouTube. + + 2. 🌍 Kies die taal waarin jy die **video wil vertaal**. + + 3. 🗣️ Spesifiseer die **aantal mense wat praat** in die video en **ken elkeen 'n teks-na-spraak-stem toe** wat geskik is vir die vertalingstaal. + + 4. 🚀 Druk die '**Vertaal**' knoppie om die resultate te verkry. + + --- + + # 🧩 **SoniTranslate ondersteun verskillende TTS (Teks-na-Spraak) enjins, wat is:** + - EDGE-TTS → formaat `en-AU-WilliamNeural-Male` → Vinnig en akkuraat. + - FACEBOOK MMS → formaat `en-facebook-mms VITS` → Die stem is meer natuurlik; op die oomblik gebruik dit net CPU. + - PIPER TTS → formaat `en_US-lessac-high VITS-onnx` → Dieselfde as die vorige een, maar dit is geoptimaliseer vir beide CPU en GPU. + - BARK → formaat `en_speaker_0-Male BARK` → Goeie kwaliteit maar stadig, en dit is geneig tot hallusinasies. + - OpenAI TTS → formaat `>alloy OpenAI-TTS` → Veeltalig maar dit benodig 'n OpenAI API sleutel. + - Coqui XTTS → formaat `_XTTS_/AUTOMATIC.wav` → Slegs beskikbaar vir Vereenvoudigde Chinees, Engels, Frans, Duits, Italiaans, Portugees, Pools, Turks, Russies, Nederlands, Tsjeggies, Arabies, Spaans, Hongaars, Koreaans en Japanees. + + --- + + # 🎤 Hoe om R.V.C. en R.V.C.2 Stemmen te Gebruik (Opsioneel) 🎶 + + Die doel is om 'n R.V.C. toe te pas op die gegenereerde TTS (Teks-na-Spraak) 🎙️ + + 1. In die `Aangepaste Stem R.V.C.` tab, laai die modelle af wat jy benodig 📥 Jy kan skakels van Hugging Face en Google Drive in formate soos zip, pth, of index gebruik. Jy kan ook volledige HF-ruimte-repositories aflaai, maar hierdie opsie is nie baie stabiel nie 😕 + + 2. Gaan nou na `Vervang stem: TTS na R.V.C.` en merk die `aktiveer` boks ✅ Na dit, kan jy die modelle kies wat jy wil toepas op elke TTS spreker 👩‍🦰👨‍🦱👩‍🦳👨‍🦲 + + 3. Pas die F0 metode aan wat toegepas sal word op alle R.V.C. 🎛️ + + 4. Druk `PAS KONFIGURASIE TOE` om die veranderinge wat jy gemaak het toe te pas 🔄 + + 5. Gaan terug na die videovertaal tab en klik op 'Vertaal' ▶️ Nou sal die vertaling gedoen word met die toepassing van die R.V.C. 
🗣️ + + Wenke: Jy kan `Toets R.V.C.` gebruik om te eksperimenteer en die beste TTS of konfigurasies te vind om op die R.V.C. toe te pas 🧪🔍 + + --- + + """, + "tab_translate": "Videovertaal", + "video_source": "Kies Video Bron", + "link_label": "Media link.", + "link_info": "Voorbeeld: www.youtube.com/watch?v=g_9rPvbENUw", + "link_ph": "URL gaan hier...", + "dir_label": "Video Pad.", + "dir_info": "Voorbeeld: /usr/home/my_video.mp4", + "dir_ph": "Pad gaan hier...", + "sl_label": "Bron taal", + "sl_info": "Dit is die oorspronklike taal van die video", + "tat_label": "Vertaal klank na", + "tat_info": "Kies die teikentaal en maak ook seker om die ooreenstemmende TTS vir daardie taal te kies.", + "num_speakers": "Kies hoeveel mense praat in die video.", + "min_sk": "Min sprekers", + "max_sk": "Max sprekers", + "tts_select": "Kies die stem wat jy vir elke spreker wil hê.", + "sk1": "TTS Spreker 1", + "sk2": "TTS Spreker 2", + "sk3": "TTS Spreker 3", + "sk4": "TTS Spreker 4", + "sk5": "TTS Spreker 5", + "sk6": "TTS Spreker 6", + "sk7": "TTS Spreker 7", + "sk8": "TTS Spreker 8", + "sk9": "TTS Spreker 9", + "sk10": "TTS Spreker 10", + "sk11": "TTS Spreker 11", + "sk12": "TTS Spreker 12", + "vc_title": "Stem Nabootsing in Verskillende Tale", + "vc_subtitle": """ + ### Herhaal 'n persoon se stem oor verskeie tale. + Terwyl effektief met die meeste stemme wanneer gepas gebruik, mag dit nie perfek wees in elke geval nie. + Stem Nabootsing herhaal slegs die verwysingspreker se toon, sonder aksent en emosie, wat deur die basispreker TTS model beheer word en nie deur die omskakelaar nageboots word nie. + Dit sal oudio monsters van die hoof oudio neem vir elke spreker en hulle verwerk. + """, + "vc_active_label": "Aktiewe Stem Nabootsing", + "vc_active_info": "Aktiewe Stem Nabootsing: Herhaal die oorspronklike spreker se toon", + "vc_method_label": "Metode", + "vc_method_info": "Kies 'n metode vir die Stem Nabootsing proses", + "vc_segments_label": "Max monsters", + "vc_segments_info": "Max monsters: Is die aantal oudio monsters wat gegenereer sal word vir die proses, meer is beter maar dit kan geraas byvoeg", + "vc_dereverb_label": "Dereverb", + "vc_dereverb_info": "Dereverb: Pas vokale dereverb toe op die oudio monsters.", + "vc_remove_label": "Verwyder vorige monsters", + "vc_remove_info": "Verwyder vorige monsters: Verwyder die vorige monsters wat gegenereer is, sodat nuwe monsters geskep moet word.", + "xtts_title": "Skep 'n TTS gebaseer op 'n oudio", + "xtts_subtitle": "Laai 'n oudio lêer van maksimum 10 sekondes op met 'n stem. Deur XTTS te gebruik, sal 'n nuwe TTS geskep word met 'n stem soortgelyk aan die verskafde oudio lêer.", + "xtts_file_label": "Laai 'n kort oudio op met die stem", + "xtts_name_label": "Naam vir die TTS", + "xtts_name_info": "Gebruik 'n eenvoudige naam", + "xtts_dereverb_label": "Dereverb oudio", + "xtts_dereverb_info": "Dereverb oudio: Pas vokale dereverb toe op die oudio", + "xtts_button": "Verwerk die oudio en sluit dit in die TTS keurder in", + "xtts_footer": "Genereer stem xtts outomaties: Jy kan `_XTTS_/AUTOMATIC.wav` gebruik in die TTS keurder om outomaties segmente te genereer vir elke spreker wanneer die vertaling gegenereer word.", + "extra_setting": "Gevorderde Instellings", + "acc_max_label": "Max Oudio versnelling", + "acc_max_info": "Maksimum versnelling vir vertaalde oudio segmente om oorvleueling te vermy. 
'n Waarde van 1.0 verteenwoordig geen versnelling nie", + "acc_rate_label": "Versnelling Reguleringskoers", + "acc_rate_info": "Versnelling Reguleringskoers: Pas versnelling aan om segmente wat minder spoed benodig te akkommodeer, handhaaf kontinuïteit en oorweeg volgende-begin tydsberekening.", + "or_label": "Oorvleueling Reduksie", + "or_info": "Oorvleueling Reduksie: Verseker segmente oorvleuel nie deur begin tye aan te pas gebaseer op vorige eind tye; kan sinkronisasie versteur.", + "aud_mix_label": "Oudio Meng Metode", + "aud_mix_info": "Meng oorspronklike en vertaalde oudio lêers om 'n aangepaste, gebalanseerde uitset te skep met twee beskikbare mengmodusse.", + "vol_ori": "Volume oorspronklike oudio", + "vol_tra": "Volume vertaalde oudio", + "voiceless_tk_label": "Stemlose Snit", + "voiceless_tk_info": "Stemlose Snit: Verwyder die oorspronklike oudio stemme voordat dit met die vertaalde oudio gekombineer word.", + "sub_type": "Onderskrif tipe", + "soft_subs_label": "Sagte Onderskrifte", + "soft_subs_info": "Sagte Onderskrifte: Opsionele onderskrifte wat kykers kan aanskakel of afskakel terwyl hulle die video kyk.", + "burn_subs_label": "Brand Onderskrifte", + "burn_subs_info": "Brand Onderskrifte: Inbed onderskrifte in die video, maak hulle 'n permanente deel van die visuele inhoud.", + "whisper_title": "Konfigureer transkripsie.", + "lnum_label": "Literaliseer Nommer", + "lnum_info": "Literaliseer Nommer: Vervang numeriese verteenwoordigings met hul geskrewe ekwivalente in die transkripsie.", + "scle_label": "Klank Opruiming", + "scle_info": "Klank Opruiming: Versterk vokale, verwyder agtergrondgeraas voor transkripsie vir uiterste tydstempel presisie. Hierdie operasie kan tyd neem, veral met lang oudio lêers.", + "sd_limit_label": "Segmentduur Beperking", + "sd_limit_info": "Spesifiseer die maksimum duur (in sekondes) vir elke segment. Die oudio sal verwerk word met VAD, wat die duur vir elke segment stuk beperk.", + "asr_model_info": "Dit omskakel gesproke taal na teks met die 'Whisper model' by verstek. Gebruik 'n aangepaste model, byvoorbeeld, deur die repository naam 'BELLE-2/Belle-whisper-large-v3-zh' in die dropdown in te voer om 'n Chinees taal fyn-afgestelde model te gebruik. Vind fyn-afgestelde modelle op Hugging Face.", + "ctype_label": "Reken tipe", + "ctype_info": "Kies kleiner tipes soos int8 of float16 kan prestasie verbeter deur geheuegebruik te verminder en berekeningstempo te verhoog, maar kan presisie opoffer in vergelyking met groter datatipes soos float32.", + "batchz_label": "Batch grootte", + "batchz_info": "Verkleining van die batch grootte bespaar geheue as jou GPU minder VRAM het en help om Uit-van-Geheue probleme te bestuur.", + "tsscale_label": "Teks Segmentasie Skale", + "tsscale_info": "Verdeel teks in segmente deur sinne, woorde, of karakters. Woord en karakter segmentasie bied fyner granulariteit, nuttig vir onderskrifte; deaktiveer vertaling behou oorspronklike struktuur.", + "srt_file_label": "Laai 'n SRT onderskriflêer op (sal gebruik word in plaas van die transkripsie van Whisper)", + "divide_text_label": "Her-verdeel teks segmente deur:", + "divide_text_info": "(Eksperimenteel) Voer 'n skeier in om bestaande teks segmente in die brontaal te verdeel. Die hulpmiddel sal voorkomste identifiseer en nuwe segmente dienooreenkomstig skep. 
Spesifiseer verskeie skeiers met behulp van |, bv.: !|?|...|。", + "diarization_label": "Diarisering model", + "tr_process_label": "Vertaal proses", + "out_type_label": "Uitvoer tipe", + "out_name_label": "Lêer naam", + "out_name_info": "Die naam van die uitvoer lêer", + "task_sound_label": "Taak Status Klank", + "task_sound_info": "Taak Status Klank: Speel 'n klank waarskuwing wat taak voltooiing of foute tydens uitvoering aandui.", + "cache_label": "Herstel Vordering", + "cache_info": "Herstel Vordering: Gaan voort met die proses vanaf die laaste kontrolepunt.", + "preview_info": "Voorskou sny die video tot slegs 10 sekondes vir toetsdoeleindes. Skakel dit asseblief af om die volle video duur te kry.", + "edit_sub_label": "Wysig gegenereerde onderskrifte", + "edit_sub_info": "Wysig gegenereerde onderskrifte: Laat jou toe om die vertaling in 2 stappe uit te voer. Eerstens met die 'KRY ONDERSKRIFTE EN WYSIG' knoppie, kry jy die onderskrifte om dit te wysig, en dan met die 'VERTAAL' knoppie, kan jy die video genereer.", + "button_subs": "KRY ONDERSKRIFTE EN WYSIG", + "editor_sub_label": "Gegenereerde onderskrifte", + "editor_sub_info": "Voel vry om die teks in die gegenereerde onderskrifte hier te wysig. Jy kan veranderinge aan die koppelvlak opsies maak voordat jy die 'VERTAAL' knoppie druk, behalwe vir 'Bron taal', 'Vertaal klank na', en 'Max sprekers', om foute te vermy. Sodra jy klaar is, klik die 'VERTAAL' knoppie.", + "editor_sub_ph": "Druk eers 'KRY ONDERSKRIFTE EN WYSIG' om die onderskrifte te kry", + "button_translate": "VERTAAL", + "output_result_label": "LAAI VERTAALDE VIDEO AF", + "sub_ori": "Onderskrifte", + "sub_tra": "Vertaalde onderskrifte", + "ht_token_info": "Een belangrike stap is om die lisensie-ooreenkoms te aanvaar vir die gebruik van Pyannote. Jy moet 'n rekening hê op Hugging Face en die lisensie aanvaar om die modelle te gebruik: https://huggingface.co/pyannote/speaker-diarization en https://huggingface.co/pyannote/segmentation. Kry jou SLEUTEL TOKEN hier: https://hf.co/settings/tokens", + "ht_token_ph": "Token gaan hier...", + "tab_docs": "Dokument vertaling", + "docs_input_label": "Kies Dokument Bron", + "docs_input_info": "Dit kan 'n PDF, DOCX, TXT, of teks wees", + "docs_source_info": "Dit is die oorspronklike taal van die teks", + "chunk_size_label": "Max aantal karakters wat die TTS per segment sal verwerk", + "chunk_size_info": "'n Waarde van 0 ken 'n dinamiese en meer versoenbare waarde toe vir die TTS.", + "docs_button": "Begin Taal Omskakelingsbrug", + "cv_url_info": "Laai outomaties die R.V.C. modelle af van die URL. Jy kan skakels van HuggingFace of Drive gebruik, en jy kan verskeie skakels insluit, elkeen geskei deur 'n komma. Voorbeeld: https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.pth, https://huggingface.co/sail-rvc/yoimiya-jp/blob/main/model.index", + "replace_title": "Vervang stem: TTS na R.V.C.", + "sec1_title": "### 1. Om die gebruik te aktiveer, merk dit as aktief.", + "enable_replace": "Merk dit om die gebruik van die modelle te aktiveer.", + "sec2_title": "### 2. Kies 'n stem wat toegepas sal word op elke TTS van elke ooreenstemmende spreker en pas die konfigurasies toe.", + "sec2_subtitle": "Afhangende van hoeveel jy sal gebruik, benodig elkeen sy onderskeie model. 
Daar is ook 'n hulp een indien 'n spreker nie korrek opgespoor word nie.", + "cv_tts1": "Kies die stem om toe te pas vir Spreker 1.", + "cv_tts2": "Kies die stem om toe te pas vir Spreker 2.", + "cv_tts3": "Kies die stem om toe te pas vir Spreker 3.", + "cv_tts4": "Kies die stem om toe te pas vir Spreker 4.", + "cv_tts5": "Kies die stem om toe te pas vir Spreker 5.", + "cv_tts6": "Kies die stem om toe te pas vir Spreker 6.", + "cv_tts7": "Kies die stem om toe te pas vir Spreker 7.", + "cv_tts8": "Kies die stem om toe te pas vir Spreker 8.", + "cv_tts9": "Kies die stem om toe te pas vir Spreker 9.", + "cv_tts10": "Kies die stem om toe te pas vir Spreker 10.", + "cv_tts11": "Kies die stem om toe te pas vir Spreker 11.", + "cv_tts12": "Kies die stem om toe te pas vir Spreker 12.", + "cv_aux": "- Stem om toe te pas in geval 'n Spreker nie suksesvol opgespoor word nie.", + "cv_button_apply": "PAS KONFIGURASIE TOE", + "tab_help": "Hulp", + }, +} diff --git a/soni_translate/logging_setup.py b/soni_translate/logging_setup.py new file mode 100644 index 0000000000000000000000000000000000000000..28f0ce6cd6ed05d44d87a7a92b7abb673502882c --- /dev/null +++ b/soni_translate/logging_setup.py @@ -0,0 +1,68 @@ +import logging +import sys +import warnings +import os + + +def configure_logging_libs(debug=False): + warnings.filterwarnings( + action="ignore", category=UserWarning, module="pyannote" + ) + modules = [ + "numba", "httpx", "markdown_it", "speechbrain", "fairseq", "pyannote", + "faiss", + "pytorch_lightning.utilities.migration.utils", + "pytorch_lightning.utilities.migration", + "pytorch_lightning", + "lightning", + "lightning.pytorch.utilities.migration.utils", + ] + try: + for module in modules: + logging.getLogger(module).setLevel(logging.WARNING) + os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3" if not debug else "1" + + # fix verbose pyannote audio + def fix_verbose_pyannote(*args, what=""): + pass + import pyannote.audio.core.model # noqa + pyannote.audio.core.model.check_version = fix_verbose_pyannote + except Exception as error: + logger.error(str(error)) + + +def setup_logger(name_log): + logger = logging.getLogger(name_log) + logger.setLevel(logging.INFO) + + _default_handler = logging.StreamHandler() # Set sys.stderr as stream. 
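+    # Descriptive note on the line below: assigning sys.stderr.flush as the
+    # handler's flush means flushing this handler also flushes the underlying
+    # stderr stream, keeping log output from lagging behind other stderr writes.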
+ _default_handler.flush = sys.stderr.flush + logger.addHandler(_default_handler) + + logger.propagate = False + + handlers = logger.handlers + + for handler in handlers: + formatter = logging.Formatter("[%(levelname)s] >> %(message)s") + handler.setFormatter(formatter) + + # logger.handlers + + return logger + + +logger = setup_logger("sonitranslate") +logger.setLevel(logging.INFO) + + +def set_logging_level(verbosity_level): + logging_level_mapping = { + "debug": logging.DEBUG, + "info": logging.INFO, + "warning": logging.WARNING, + "error": logging.ERROR, + "critical": logging.CRITICAL, + } + + logger.setLevel(logging_level_mapping.get(verbosity_level, logging.INFO)) diff --git a/soni_translate/mdx_net.py b/soni_translate/mdx_net.py new file mode 100644 index 0000000000000000000000000000000000000000..1623ecd17a29eb663964c94412c11f26e27d23c4 --- /dev/null +++ b/soni_translate/mdx_net.py @@ -0,0 +1,582 @@ +import gc +import hashlib +import os +import queue +import threading +import json +import shlex +import sys +import subprocess +import librosa +import numpy as np +import soundfile as sf +import torch +from tqdm import tqdm + +try: + from .utils import ( + remove_directory_contents, + create_directories, + ) +except: # noqa + from utils import ( + remove_directory_contents, + create_directories, + ) +from .logging_setup import logger + +try: + import onnxruntime as ort +except Exception as error: + logger.error(str(error)) +# import warnings +# warnings.filterwarnings("ignore") + +stem_naming = { + "Vocals": "Instrumental", + "Other": "Instruments", + "Instrumental": "Vocals", + "Drums": "Drumless", + "Bass": "Bassless", +} + + +class MDXModel: + def __init__( + self, + device, + dim_f, + dim_t, + n_fft, + hop=1024, + stem_name=None, + compensation=1.000, + ): + self.dim_f = dim_f + self.dim_t = dim_t + self.dim_c = 4 + self.n_fft = n_fft + self.hop = hop + self.stem_name = stem_name + self.compensation = compensation + + self.n_bins = self.n_fft // 2 + 1 + self.chunk_size = hop * (self.dim_t - 1) + self.window = torch.hann_window( + window_length=self.n_fft, periodic=True + ).to(device) + + out_c = self.dim_c + + self.freq_pad = torch.zeros( + [1, out_c, self.n_bins - self.dim_f, self.dim_t] + ).to(device) + + def stft(self, x): + x = x.reshape([-1, self.chunk_size]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop, + window=self.window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( + [-1, 4, self.n_bins, self.dim_t] + ) + return x[:, :, : self.dim_f] + + def istft(self, x, freq_pad=None): + freq_pad = ( + self.freq_pad.repeat([x.shape[0], 1, 1, 1]) + if freq_pad is None + else freq_pad + ) + x = torch.cat([x, freq_pad], -2) + # c = 4*2 if self.target_name=='*' else 2 + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( + [-1, 2, self.n_bins, self.dim_t] + ) + x = x.permute([0, 2, 3, 1]) + x = x.contiguous() + x = torch.view_as_complex(x) + x = torch.istft( + x, + n_fft=self.n_fft, + hop_length=self.hop, + window=self.window, + center=True, + ) + return x.reshape([-1, 2, self.chunk_size]) + + +class MDX: + DEFAULT_SR = 44100 + # Unit: seconds + DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR + DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR + + def __init__( + self, model_path: str, params: MDXModel, processor=0 + ): + # Set the device and the provider (CPU or CUDA) + self.device = ( + torch.device(f"cuda:{processor}") + if processor >= 0 + else torch.device("cpu") + ) 
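+        # Pick the ONNX Runtime execution provider to match the device chosen
+        # above: CUDAExecutionProvider when a GPU index (processor >= 0) was
+        # requested, CPUExecutionProvider otherwise.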
+ self.provider = ( + ["CUDAExecutionProvider"] + if processor >= 0 + else ["CPUExecutionProvider"] + ) + + self.model = params + + # Load the ONNX model using ONNX Runtime + self.ort = ort.InferenceSession(model_path, providers=self.provider) + # Preload the model for faster performance + self.ort.run( + None, + {"input": torch.rand(1, 4, params.dim_f, params.dim_t).numpy()}, + ) + self.process = lambda spec: self.ort.run( + None, {"input": spec.cpu().numpy()} + )[0] + + self.prog = None + + @staticmethod + def get_hash(model_path): + try: + with open(model_path, "rb") as f: + f.seek(-10000 * 1024, 2) + model_hash = hashlib.md5(f.read()).hexdigest() + except: # noqa + model_hash = hashlib.md5(open(model_path, "rb").read()).hexdigest() + + return model_hash + + @staticmethod + def segment( + wave, + combine=True, + chunk_size=DEFAULT_CHUNK_SIZE, + margin_size=DEFAULT_MARGIN_SIZE, + ): + """ + Segment or join segmented wave array + + Args: + wave: (np.array) Wave array to be segmented or joined + combine: (bool) If True, combines segmented wave array. + If False, segments wave array. + chunk_size: (int) Size of each segment (in samples) + margin_size: (int) Size of margin between segments (in samples) + + Returns: + numpy array: Segmented or joined wave array + """ + + if combine: + # Initializing as None instead of [] for later numpy array concatenation + processed_wave = None + for segment_count, segment in enumerate(wave): + start = 0 if segment_count == 0 else margin_size + end = None if segment_count == len(wave) - 1 else -margin_size + if margin_size == 0: + end = None + if processed_wave is None: # Create array for first segment + processed_wave = segment[:, start:end] + else: # Concatenate to existing array for subsequent segments + processed_wave = np.concatenate( + (processed_wave, segment[:, start:end]), axis=-1 + ) + + else: + processed_wave = [] + sample_count = wave.shape[-1] + + if chunk_size <= 0 or chunk_size > sample_count: + chunk_size = sample_count + + if margin_size > chunk_size: + margin_size = chunk_size + + for segment_count, skip in enumerate( + range(0, sample_count, chunk_size) + ): + margin = 0 if segment_count == 0 else margin_size + end = min(skip + chunk_size + margin_size, sample_count) + start = skip - margin + + cut = wave[:, start:end].copy() + processed_wave.append(cut) + + if end == sample_count: + break + + return processed_wave + + def pad_wave(self, wave): + """ + Pad the wave array to match the required chunk size + + Args: + wave: (np.array) Wave array to be padded + + Returns: + tuple: (padded_wave, pad, trim) + - padded_wave: Padded wave array + - pad: Number of samples that were padded + - trim: Number of samples that were trimmed + """ + n_sample = wave.shape[1] + trim = self.model.n_fft // 2 + gen_size = self.model.chunk_size - 2 * trim + pad = gen_size - n_sample % gen_size + + # Padded wave + wave_p = np.concatenate( + ( + np.zeros((2, trim)), + wave, + np.zeros((2, pad)), + np.zeros((2, trim)), + ), + 1, + ) + + mix_waves = [] + for i in range(0, n_sample + pad, gen_size): + waves = np.array(wave_p[:, i:i + self.model.chunk_size]) + mix_waves.append(waves) + + mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to( + self.device + ) + + return mix_waves, pad, trim + + def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int): + """ + Process each wave segment in a multi-threaded environment + + Args: + mix_waves: (torch.Tensor) Wave segments to be processed + trim: (int) Number of samples trimmed during padding + pad: 
(int) Number of samples padded during padding + q: (queue.Queue) Queue to hold the processed wave segments + _id: (int) Identifier of the processed wave segment + + Returns: + numpy array: Processed wave segment + """ + mix_waves = mix_waves.split(1) + with torch.no_grad(): + pw = [] + for mix_wave in mix_waves: + self.prog.update() + spec = self.model.stft(mix_wave) + processed_spec = torch.tensor(self.process(spec)) + processed_wav = self.model.istft( + processed_spec.to(self.device) + ) + processed_wav = ( + processed_wav[:, :, trim:-trim] + .transpose(0, 1) + .reshape(2, -1) + .cpu() + .numpy() + ) + pw.append(processed_wav) + processed_signal = np.concatenate(pw, axis=-1)[:, :-pad] + q.put({_id: processed_signal}) + return processed_signal + + def process_wave(self, wave: np.array, mt_threads=1): + """ + Process the wave array in a multi-threaded environment + + Args: + wave: (np.array) Wave array to be processed + mt_threads: (int) Number of threads to be used for processing + + Returns: + numpy array: Processed wave array + """ + self.prog = tqdm(total=0) + chunk = wave.shape[-1] // mt_threads + waves = self.segment(wave, False, chunk) + + # Create a queue to hold the processed wave segments + q = queue.Queue() + threads = [] + for c, batch in enumerate(waves): + mix_waves, pad, trim = self.pad_wave(batch) + self.prog.total = len(mix_waves) * mt_threads + thread = threading.Thread( + target=self._process_wave, args=(mix_waves, trim, pad, q, c) + ) + thread.start() + threads.append(thread) + for thread in threads: + thread.join() + self.prog.close() + + processed_batches = [] + while not q.empty(): + processed_batches.append(q.get()) + processed_batches = [ + list(wave.values())[0] + for wave in sorted( + processed_batches, key=lambda d: list(d.keys())[0] + ) + ] + assert len(processed_batches) == len( + waves + ), "Incomplete processed batches, please reduce batch size!" 
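+        # Re-join the per-thread chunks into a single continuous waveform,
+        # trimming the overlap margins that were added when the input was segmented.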
+ return self.segment(processed_batches, True, chunk) + + +def run_mdx( + model_params, + output_dir, + model_path, + filename, + exclude_main=False, + exclude_inversion=False, + suffix=None, + invert_suffix=None, + denoise=False, + keep_orig=True, + m_threads=2, + device_base="cuda", +): + if device_base == "cuda": + device = torch.device("cuda:0") + processor_num = 0 + device_properties = torch.cuda.get_device_properties(device) + vram_gb = device_properties.total_memory / 1024**3 + m_threads = 1 if vram_gb < 8 else 2 + else: + device = torch.device("cpu") + processor_num = -1 + m_threads = 1 + + model_hash = MDX.get_hash(model_path) + mp = model_params.get(model_hash) + model = MDXModel( + device, + dim_f=mp["mdx_dim_f_set"], + dim_t=2 ** mp["mdx_dim_t_set"], + n_fft=mp["mdx_n_fft_scale_set"], + stem_name=mp["primary_stem"], + compensation=mp["compensate"], + ) + + mdx_sess = MDX(model_path, model, processor=processor_num) + wave, sr = librosa.load(filename, mono=False, sr=44100) + # normalizing input wave gives better output + peak = max(np.max(wave), abs(np.min(wave))) + wave /= peak + if denoise: + wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + ( + mdx_sess.process_wave(wave, m_threads) + ) + wave_processed *= 0.5 + else: + wave_processed = mdx_sess.process_wave(wave, m_threads) + # return to previous peak + wave_processed *= peak + stem_name = model.stem_name if suffix is None else suffix + + main_filepath = None + if not exclude_main: + main_filepath = os.path.join( + output_dir, + f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav", + ) + sf.write(main_filepath, wave_processed.T, sr) + + invert_filepath = None + if not exclude_inversion: + diff_stem_name = ( + stem_naming.get(stem_name) + if invert_suffix is None + else invert_suffix + ) + stem_name = ( + f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name + ) + invert_filepath = os.path.join( + output_dir, + f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav", + ) + sf.write( + invert_filepath, + (-wave_processed.T * model.compensation) + wave.T, + sr, + ) + + if not keep_orig: + os.remove(filename) + + del mdx_sess, wave_processed, wave + gc.collect() + torch.cuda.empty_cache() + return main_filepath, invert_filepath + + +MDX_DOWNLOAD_LINK = "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/" +UVR_MODELS = [ + "UVR-MDX-NET-Voc_FT.onnx", + "UVR_MDXNET_KARA_2.onnx", + "Reverb_HQ_By_FoxJoy.onnx", + "UVR-MDX-NET-Inst_HQ_4.onnx", +] +BASE_DIR = "." 
# os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +mdxnet_models_dir = os.path.join(BASE_DIR, "mdx_models") +output_dir = os.path.join(BASE_DIR, "clean_song_output") + + +def convert_to_stereo_and_wav(audio_path): + wave, sr = librosa.load(audio_path, mono=False, sr=44100) + + # check if mono + if type(wave[0]) != np.ndarray or audio_path[-4:].lower() != ".wav": # noqa + stereo_path = f"{os.path.splitext(audio_path)[0]}_stereo.wav" + stereo_path = os.path.join(output_dir, stereo_path) + + command = shlex.split( + f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 2 -f wav "{stereo_path}"' + ) + sub_params = { + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + "creationflags": subprocess.CREATE_NO_WINDOW + if sys.platform == "win32" + else 0, + } + process_wav = subprocess.Popen(command, **sub_params) + output, errors = process_wav.communicate() + if process_wav.returncode != 0 or not os.path.exists(stereo_path): + raise Exception("Error processing audio to stereo wav") + + return stereo_path + else: + return audio_path + + +def process_uvr_task( + orig_song_path: str = "aud_test.mp3", + main_vocals: bool = False, + dereverb: bool = True, + song_id: str = "mdx", # folder output name + only_voiceless: bool = False, + remove_files_output_dir: bool = False, +): + if os.environ.get("SONITR_DEVICE") == "cpu": + device_base = "cpu" + else: + device_base = "cuda" if torch.cuda.is_available() else "cpu" + + if remove_files_output_dir: + remove_directory_contents(output_dir) + + with open(os.path.join(mdxnet_models_dir, "data.json")) as infile: + mdx_model_params = json.load(infile) + + song_output_dir = os.path.join(output_dir, song_id) + create_directories(song_output_dir) + orig_song_path = convert_to_stereo_and_wav(orig_song_path) + + logger.debug(f"onnxruntime device >> {ort.get_device()}") + + if only_voiceless: + logger.info("Voiceless Track Separation...") + return run_mdx( + mdx_model_params, + song_output_dir, + os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Inst_HQ_4.onnx"), + orig_song_path, + suffix="Voiceless", + denoise=False, + keep_orig=True, + exclude_inversion=True, + device_base=device_base, + ) + + logger.info("Vocal Track Isolation and Voiceless Track Separation...") + vocals_path, instrumentals_path = run_mdx( + mdx_model_params, + song_output_dir, + os.path.join(mdxnet_models_dir, "UVR-MDX-NET-Voc_FT.onnx"), + orig_song_path, + denoise=True, + keep_orig=True, + device_base=device_base, + ) + + if main_vocals: + logger.info("Main Voice Separation from Supporting Vocals...") + backup_vocals_path, main_vocals_path = run_mdx( + mdx_model_params, + song_output_dir, + os.path.join(mdxnet_models_dir, "UVR_MDXNET_KARA_2.onnx"), + vocals_path, + suffix="Backup", + invert_suffix="Main", + denoise=True, + device_base=device_base, + ) + else: + backup_vocals_path, main_vocals_path = None, vocals_path + + if dereverb: + logger.info("Vocal Clarity Enhancement through De-Reverberation...") + _, vocals_dereverb_path = run_mdx( + mdx_model_params, + song_output_dir, + os.path.join(mdxnet_models_dir, "Reverb_HQ_By_FoxJoy.onnx"), + main_vocals_path, + invert_suffix="DeReverb", + exclude_main=True, + denoise=True, + device_base=device_base, + ) + else: + vocals_dereverb_path = main_vocals_path + + return ( + vocals_path, + instrumentals_path, + backup_vocals_path, + main_vocals_path, + vocals_dereverb_path, + ) + + +if __name__ == "__main__": + from utils import download_manager + + for id_model in UVR_MODELS: + download_manager( + os.path.join(MDX_DOWNLOAD_LINK, id_model), 
mdxnet_models_dir + ) + ( + vocals_path_, + instrumentals_path_, + backup_vocals_path_, + main_vocals_path_, + vocals_dereverb_path_, + ) = process_uvr_task( + orig_song_path="aud.mp3", + main_vocals=True, + dereverb=True, + song_id="mdx", + remove_files_output_dir=True, + ) diff --git a/soni_translate/postprocessor.py b/soni_translate/postprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..61a54eb085a26d62363ec0fff0542c27767973f0 --- /dev/null +++ b/soni_translate/postprocessor.py @@ -0,0 +1,229 @@ +from .utils import remove_files, run_command +from .text_multiformat_processor import get_subtitle +from .logging_setup import logger +import unicodedata +import shutil +import copy +import os +import re + +OUTPUT_TYPE_OPTIONS = [ + "video (mp4)", + "video (mkv)", + "audio (mp3)", + "audio (ogg)", + "audio (wav)", + "subtitle", + "subtitle [by speaker]", + "video [subtitled] (mp4)", + "video [subtitled] (mkv)", + "audio [original vocal sound]", + "audio [original background sound]", + "audio [original vocal and background sound]", + "audio [original vocal-dereverb sound]", + "audio [original vocal-dereverb and background sound]", + "raw media", +] + +DOCS_OUTPUT_TYPE_OPTIONS = [ + "videobook (mp4)", + "videobook (mkv)", + "audiobook (wav)", + "audiobook (mp3)", + "audiobook (ogg)", + "book (txt)", +] # Add DOCX and etc. + + +def get_no_ext_filename(file_path): + file_name_with_extension = os.path.basename(rf"{file_path}") + filename_without_extension, _ = os.path.splitext(file_name_with_extension) + return filename_without_extension + + +def get_video_info(link): + aux_name = f"video_url_{link}" + params_dlp = {"quiet": True, "no_warnings": True, "noplaylist": True} + try: + from yt_dlp import YoutubeDL + + with YoutubeDL(params_dlp) as ydl: + if link.startswith(("www.youtube.com/", "m.youtube.com/")): + link = "https://" + link + info_dict = ydl.extract_info(link, download=False, process=False) + video_id = info_dict.get("id", aux_name) + video_title = info_dict.get("title", video_id) + if "youtube.com" in link and "&list=" in link: + video_title = ydl.extract_info( + "https://m.youtube.com/watch?v="+video_id, + download=False, + process=False + ).get("title", video_title) + except Exception as error: + logger.error(str(error)) + video_title, video_id = aux_name, "NO_ID" + return video_title, video_id + + +def sanitize_file_name(file_name): + # Normalize the string to NFKD form to separate combined + # characters into base characters and diacritics + normalized_name = unicodedata.normalize("NFKD", file_name) + # Replace any non-ASCII characters or special symbols with an underscore + sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name) + return sanitized_name + + +def get_output_file( + original_file, + new_file_name, + soft_subtitles, + output_directory="", +): + directory_base = "." 
# default directory + + if output_directory and os.path.isdir(output_directory): + new_file_path = os.path.join(output_directory, new_file_name) + else: + new_file_path = os.path.join(directory_base, "outputs", new_file_name) + remove_files(new_file_path) + + cm = None + if soft_subtitles and original_file.endswith(".mp4"): + if new_file_path.endswith(".mp4"): + cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s mov_text "{new_file_path}"' + else: + cm = f'ffmpeg -y -i "{original_file}" -i sub_tra.srt -i sub_ori.srt -map 0:v -map 0:a -map 1 -map 2 -c:v copy -c:a copy -c:s srt -movflags use_metadata_tags -map_metadata 0 "{new_file_path}"' + elif new_file_path.endswith(".mkv"): + cm = f'ffmpeg -i "{original_file}" -c:v copy -c:a copy "{new_file_path}"' + elif new_file_path.endswith(".wav") and not original_file.endswith(".wav"): + cm = f'ffmpeg -y -i "{original_file}" -acodec pcm_s16le -ar 44100 -ac 2 "{new_file_path}"' + elif new_file_path.endswith(".ogg"): + cm = f'ffmpeg -i "{original_file}" -c:a libvorbis "{new_file_path}"' + elif new_file_path.endswith(".mp3") and not original_file.endswith(".mp3"): + cm = f'ffmpeg -y -i "{original_file}" -codec:a libmp3lame -qscale:a 2 "{new_file_path}"' + + if cm: + try: + run_command(cm) + except Exception as error: + logger.error(str(error)) + remove_files(new_file_path) + shutil.copy2(original_file, new_file_path) + else: + shutil.copy2(original_file, new_file_path) + + return os.path.abspath(new_file_path) + + +def media_out( + media_file, + lang_code, + media_out_name="", + extension="mp4", + file_obj="video_dub.mp4", + soft_subtitles=False, + subtitle_files="disable", +): + if not media_out_name: + if os.path.exists(media_file): + base_name = get_no_ext_filename(media_file) + else: + base_name, _ = get_video_info(media_file) + + media_out_name = f"{base_name}__{lang_code}" + + f_name = f"{sanitize_file_name(media_out_name)}.{extension}" + + if subtitle_files != "disable": + final_media = [get_output_file(file_obj, f_name, soft_subtitles)] + name_tra = f"{sanitize_file_name(media_out_name)}.{subtitle_files}" + name_ori = f"{sanitize_file_name(base_name)}.{subtitle_files}" + tgt_subs = f"sub_tra.{subtitle_files}" + ori_subs = f"sub_ori.{subtitle_files}" + final_subtitles = [ + get_output_file(tgt_subs, name_tra, False), + get_output_file(ori_subs, name_ori, False) + ] + return final_media + final_subtitles + else: + return get_output_file(file_obj, f_name, soft_subtitles) + + +def get_subtitle_speaker(media_file, result, language, extension, base_name): + + segments_base = copy.deepcopy(result) + + # Sub segments by speaker + segments_by_speaker = {} + for segment in segments_base["segments"]: + if segment["speaker"] not in segments_by_speaker.keys(): + segments_by_speaker[segment["speaker"]] = [segment] + else: + segments_by_speaker[segment["speaker"]].append(segment) + + if not base_name: + if os.path.exists(media_file): + base_name = get_no_ext_filename(media_file) + else: + base_name, _ = get_video_info(media_file) + + files_subs = [] + for name_sk, segments in segments_by_speaker.items(): + + subtitle_speaker = get_subtitle( + language, + {"segments": segments}, + extension, + filename=name_sk, + ) + + media_out_name = f"{base_name}_{language}_{name_sk}" + + output = media_out( + media_file, # no need + language, + media_out_name, + extension, + file_obj=subtitle_speaker, + ) + + files_subs.append(output) + + return files_subs + + +def sound_separate(media_file, 
task_uvr): + from .mdx_net import process_uvr_task + + outputs = [] + + if "vocal" in task_uvr: + try: + _, _, _, _, vocal_audio = process_uvr_task( + orig_song_path=media_file, + main_vocals=False, + dereverb=True if "dereverb" in task_uvr else False, + remove_files_output_dir=True, + ) + outputs.append(vocal_audio) + except Exception as error: + logger.error(str(error)) + + if "background" in task_uvr: + try: + background_audio, _ = process_uvr_task( + orig_song_path=media_file, + song_id="voiceless", + only_voiceless=True, + remove_files_output_dir=False if "vocal" in task_uvr else True, + ) + # copy_files(background_audio, ".") + outputs.append(background_audio) + except Exception as error: + logger.error(str(error)) + + if not outputs: + raise Exception("Error in uvr process") + + return outputs diff --git a/soni_translate/preprocessor.py b/soni_translate/preprocessor.py new file mode 100644 index 0000000000000000000000000000000000000000..9eb115d169904ce7ccc4513a4010583cb2dfa0a4 --- /dev/null +++ b/soni_translate/preprocessor.py @@ -0,0 +1,308 @@ +from .utils import remove_files +import os, shutil, subprocess, time, shlex, sys # noqa +from .logging_setup import logger +import json + +ERROR_INCORRECT_CODEC_PARAMETERS = [ + "prores", # mov + "ffv1", # mkv + "msmpeg4v3", # avi + "wmv2", # wmv + "theora", # ogv +] # fix final merge + +TESTED_CODECS = [ + "h264", # mp4 + "h265", # mp4 + "vp9", # webm + "mpeg4", # mp4 + "mpeg2video", # mpg + "mjpeg", # avi +] + + +class OperationFailedError(Exception): + def __init__(self, message="The operation did not complete successfully."): + self.message = message + super().__init__(self.message) + + +def get_video_codec(video_file): + command_base = rf'ffprobe -v error -select_streams v:0 -show_entries stream=codec_name -of json "{video_file}"' + command = shlex.split(command_base) + try: + process = subprocess.Popen( + command, + stdout=subprocess.PIPE, + creationflags=subprocess.CREATE_NO_WINDOW if sys.platform == "win32" else 0, + ) + output, _ = process.communicate() + codec_info = json.loads(output.decode('utf-8')) + codec_name = codec_info['streams'][0]['codec_name'] + return codec_name + except Exception as error: + logger.debug(str(error)) + return None + + +def audio_preprocessor(preview, base_audio, audio_wav, use_cuda=False): + base_audio = base_audio.strip() + previous_files_to_remove = [audio_wav] + remove_files(previous_files_to_remove) + + if preview: + logger.warning( + "Creating a preview video of 10 seconds, to disable " + "this option, go to advanced settings and turn off preview." 
+ ) + wav_ = f'ffmpeg -y -i "{base_audio}" -ss 00:00:20 -t 00:00:10 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav' + else: + wav_ = f'ffmpeg -y -i "{base_audio}" -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav' + + # Run cmd process + sub_params = { + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + "creationflags": subprocess.CREATE_NO_WINDOW + if sys.platform == "win32" + else 0, + } + wav_ = shlex.split(wav_) + result_convert_audio = subprocess.Popen(wav_, **sub_params) + output, errors = result_convert_audio.communicate() + time.sleep(1) + if result_convert_audio.returncode in [1, 2] or not os.path.exists( + audio_wav + ): + raise OperationFailedError(f"Error can't create the audio file:\n{errors.decode('utf-8')}") + + +def audio_video_preprocessor( + preview, video, OutputFile, audio_wav, use_cuda=False +): + video = video.strip() + previous_files_to_remove = [OutputFile, "audio.webm", audio_wav] + remove_files(previous_files_to_remove) + + if os.path.exists(video): + if preview: + logger.warning( + "Creating a preview video of 10 seconds, " + "to disable this option, go to advanced " + "settings and turn off preview." + ) + mp4_ = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4' + else: + video_codec = get_video_codec(video) + if not video_codec: + logger.debug("No video codec found in video") + else: + logger.info(f"Video codec: {video_codec}") + + # Check if the file ends with ".mp4" extension or is valid codec + if video.endswith(".mp4") or video_codec in TESTED_CODECS: + destination_path = os.path.join(os.getcwd(), "Video.mp4") + shutil.copy(video, destination_path) + time.sleep(0.5) + if os.path.exists(OutputFile): + mp4_ = "ffmpeg -h" + else: + mp4_ = f'ffmpeg -y -i "{video}" -c copy Video.mp4' + else: + logger.warning( + "File does not have the '.mp4' extension or a " + "supported codec. Converting video to mp4 (codec: h264)." + ) + mp4_ = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4' + else: + if preview: + logger.warning( + "Creating a preview from the link, 10 seconds " + "to disable this option, go to advanced " + "settings and turn off preview." 
+ ) + # https://github.com/yt-dlp/yt-dlp/issues/2220 + mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}' + wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav" + else: + mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}' + wav_ = f"python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-playlist --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}" + + # Run cmd process + mp4_ = shlex.split(mp4_) + sub_params = { + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + "creationflags": subprocess.CREATE_NO_WINDOW + if sys.platform == "win32" + else 0, + } + + if os.path.exists(video): + logger.info("Process video...") + result_convert_video = subprocess.Popen(mp4_, **sub_params) + # result_convert_video.wait() + output, errors = result_convert_video.communicate() + time.sleep(1) + if result_convert_video.returncode in [1, 2] or not os.path.exists( + OutputFile + ): + raise OperationFailedError(f"Error processing video:\n{errors.decode('utf-8')}") + logger.info("Process audio...") + wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav" + wav_ = shlex.split(wav_) + result_convert_audio = subprocess.Popen(wav_, **sub_params) + output, errors = result_convert_audio.communicate() + time.sleep(1) + if result_convert_audio.returncode in [1, 2] or not os.path.exists( + audio_wav + ): + raise OperationFailedError(f"Error can't create the audio file:\n{errors.decode('utf-8')}") + + else: + wav_ = shlex.split(wav_) + if preview: + result_convert_video = subprocess.Popen(mp4_, **sub_params) + output, errors = result_convert_video.communicate() + time.sleep(0.5) + result_convert_audio = subprocess.Popen(wav_, **sub_params) + output, errors = result_convert_audio.communicate() + time.sleep(0.5) + if result_convert_audio.returncode in [1, 2] or not os.path.exists( + audio_wav + ): + raise OperationFailedError( + f"Error can't create the preview file:\n{errors.decode('utf-8')}" + ) + else: + logger.info("Process audio...") + result_convert_audio = subprocess.Popen(wav_, **sub_params) + output, errors = result_convert_audio.communicate() + time.sleep(1) + if result_convert_audio.returncode in [1, 2] or not os.path.exists( + audio_wav + ): + raise OperationFailedError(f"Error can't download the audio:\n{errors.decode('utf-8')}") + logger.info("Process video...") + result_convert_video = subprocess.Popen(mp4_, **sub_params) + output, errors = result_convert_video.communicate() + time.sleep(1) + if result_convert_video.returncode in [1, 2] or not os.path.exists( + OutputFile + ): + raise OperationFailedError(f"Error can't download the video:\n{errors.decode('utf-8')}") + + +def old_audio_video_preprocessor(preview, video, OutputFile, audio_wav): + previous_files_to_remove = [OutputFile, "audio.webm", audio_wav] + remove_files(previous_files_to_remove) + + if os.path.exists(video): + if preview: + logger.warning( + "Creating a preview video of 10 seconds, " + "to disable this option, go to advanced " + "settings and turn off preview." 
+            )
+            command = f'ffmpeg -y -i "{video}" -ss 00:00:20 -t 00:00:10 -c:v libx264 -c:a aac -strict experimental Video.mp4'
+            result_convert_video = subprocess.run(
+                command, capture_output=True, text=True, shell=True
+            )
+        else:
+            # Check if the file ends with the ".mp4" extension
+            if video.endswith(".mp4"):
+                destination_path = os.path.join(os.getcwd(), "Video.mp4")
+                shutil.copy(video, destination_path)
+                result_convert_video = subprocess.run(
+                    "echo Video copied",
+                    capture_output=True,
+                    text=True,
+                    shell=True,
+                )
+            else:
+                logger.warning(
+                    "File does not have the '.mp4' extension. Converting video."
+                )
+                command = f'ffmpeg -y -i "{video}" -c:v libx264 -c:a aac -strict experimental Video.mp4'
+                result_convert_video = subprocess.run(
+                    command, capture_output=True, text=True, shell=True
+                )
+
+        if result_convert_video.returncode in [1, 2]:
+            raise OperationFailedError("Error: can't convert the video")
+
+        for i in range(120):
+            time.sleep(1)
+            logger.info("Process video...")
+            if os.path.exists(OutputFile):
+                time.sleep(1)
+                command = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav"
+                result_convert_audio = subprocess.run(
+                    command, capture_output=True, text=True, shell=True
+                )
+                time.sleep(1)
+                break
+            if i == 119:
+                # if not os.path.exists(OutputFile):
+                raise OperationFailedError("Error processing video")
+
+        if result_convert_audio.returncode in [1, 2]:
+            raise OperationFailedError(
+                f"Error: can't create the audio file: {result_convert_audio.stderr}"
+            )
+
+        for i in range(120):
+            time.sleep(1)
+            logger.info("Process audio...")
+            if os.path.exists(audio_wav):
+                break
+            if i == 119:
+                raise OperationFailedError("Error: can't create the audio file")
+
+    else:
+        video = video.strip()
+        if preview:
+            logger.warning(
+                "Creating a 10-second preview from the link. To "
+                "disable this option, go to advanced settings "
+                "and turn off preview."
+ ) + # https://github.com/yt-dlp/yt-dlp/issues/2220 + mp4_ = f'yt-dlp -f "mp4" --downloader ffmpeg --downloader-args "ffmpeg_i: -ss 00:00:20 -t 00:00:10" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}' + wav_ = "ffmpeg -y -i Video.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 audio.wav" + result_convert_video = subprocess.run( + mp4_, capture_output=True, text=True, shell=True + ) + result_convert_audio = subprocess.run( + wav_, capture_output=True, text=True, shell=True + ) + if result_convert_audio.returncode in [1, 2]: + raise OperationFailedError("Error can't download a preview") + else: + mp4_ = f'yt-dlp -f "mp4" --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --restrict-filenames -o {OutputFile} {video}' + wav_ = f"python -m yt_dlp --output {audio_wav} --force-overwrites --max-downloads 1 --no-warnings --no-abort-on-error --ignore-no-formats-error --extract-audio --audio-format wav {video}" + + result_convert_audio = subprocess.run( + wav_, capture_output=True, text=True, shell=True + ) + + if result_convert_audio.returncode in [1, 2]: + raise OperationFailedError("Error can't download the audio") + + for i in range(120): + time.sleep(1) + logger.info("Process audio...") + if os.path.exists(audio_wav) and not os.path.exists( + "audio.webm" + ): + time.sleep(1) + result_convert_video = subprocess.run( + mp4_, capture_output=True, text=True, shell=True + ) + break + if i == 119: + raise OperationFailedError("Error downloading the audio") + + if result_convert_video.returncode in [1, 2]: + raise OperationFailedError("Error can't download the video") diff --git a/soni_translate/speech_segmentation.py b/soni_translate/speech_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..5f33bc9bb823d5ba7e18ffa2433c44ae5e825892 --- /dev/null +++ b/soni_translate/speech_segmentation.py @@ -0,0 +1,499 @@ +from whisperx.alignment import ( + DEFAULT_ALIGN_MODELS_TORCH as DAMT, + DEFAULT_ALIGN_MODELS_HF as DAMHF, +) +from whisperx.utils import TO_LANGUAGE_CODE +import whisperx +import torch +import gc +import os +import soundfile as sf +from IPython.utils import capture # noqa +from .language_configuration import EXTRA_ALIGN, INVERTED_LANGUAGES +from .logging_setup import logger +from .postprocessor import sanitize_file_name +from .utils import remove_directory_contents, run_command + +# ZERO GPU CONFIG +import spaces +import copy +import random +import time + +def random_sleep(): + if os.environ.get("ZERO_GPU") == "TRUE": + print("Random sleep") + sleep_time = round(random.uniform(7.2, 9.9), 1) + time.sleep(sleep_time) + + +@spaces.GPU(duration=120) +def load_and_transcribe_audio(asr_model, audio, compute_type, language, asr_options, batch_size, segment_duration_limit): + # Load model + model = whisperx.load_model( + asr_model, + os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda", + compute_type=compute_type, + language=language, + asr_options=asr_options, + ) + + # Transcribe audio + result = model.transcribe( + audio, + batch_size=batch_size, + chunk_size=segment_duration_limit, + print_progress=True, + ) + + del model + gc.collect() + torch.cuda.empty_cache() # noqa + + return result + +def load_align_and_align_segments(result, audio, DAMHF): + + # Load alignment model + model_a, metadata = whisperx.load_align_model( + language_code=result["language"], + device=os.environ.get("SONITR_DEVICE") if 
os.environ.get("ZERO_GPU") != "TRUE" else "cuda", + model_name=None + if result["language"] in DAMHF.keys() + else EXTRA_ALIGN[result["language"]], + ) + + # Align segments + alignment_result = whisperx.align( + result["segments"], + model_a, + metadata, + audio, + os.environ.get("SONITR_DEVICE") if os.environ.get("ZERO_GPU") != "TRUE" else "cuda", + return_char_alignments=True, + print_progress=False, + ) + + # Clean up + del model_a + gc.collect() + torch.cuda.empty_cache() # noqa + + return alignment_result + +@spaces.GPU(duration=120) +def diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers): + + if os.environ.get("ZERO_GPU") == "TRUE": + diarize_model.model.to(torch.device("cuda")) + diarize_segments = diarize_model( + audio_wav, + min_speakers=min_speakers, + max_speakers=max_speakers + ) + return diarize_segments + +# ZERO GPU CONFIG + +ASR_MODEL_OPTIONS = [ + "tiny", + "base", + "small", + "medium", + "large", + "large-v1", + "large-v2", + "large-v3", + "distil-large-v2", + "Systran/faster-distil-whisper-large-v3", + "tiny.en", + "base.en", + "small.en", + "medium.en", + "distil-small.en", + "distil-medium.en", + "OpenAI_API_Whisper", +] + +COMPUTE_TYPE_GPU = [ + "default", + "auto", + "int8", + "int8_float32", + "int8_float16", + "int8_bfloat16", + "float16", + "bfloat16", + "float32" +] + +COMPUTE_TYPE_CPU = [ + "default", + "auto", + "int8", + "int8_float32", + "int16", + "float32", +] + +WHISPER_MODELS_PATH = './WHISPER_MODELS' + + +def openai_api_whisper( + input_audio_file, + source_lang=None, + chunk_duration=1800 +): + + info = sf.info(input_audio_file) + duration = info.duration + + output_directory = "./whisper_api_audio_parts" + os.makedirs(output_directory, exist_ok=True) + remove_directory_contents(output_directory) + + if duration > chunk_duration: + # Split the audio file into smaller chunks with 30-minute duration + cm = f'ffmpeg -i "{input_audio_file}" -f segment -segment_time {chunk_duration} -c:a libvorbis "{output_directory}/output%03d.ogg"' + run_command(cm) + # Get list of generated chunk files + chunk_files = sorted( + [f"{output_directory}/{f}" for f in os.listdir(output_directory) if f.endswith('.ogg')] + ) + else: + one_file = f"{output_directory}/output000.ogg" + cm = f'ffmpeg -i "{input_audio_file}" -c:a libvorbis {one_file}' + run_command(cm) + chunk_files = [one_file] + + # Transcript + segments = [] + language = source_lang if source_lang else None + for i, chunk in enumerate(chunk_files): + from openai import OpenAI + client = OpenAI() + + audio_file = open(chunk, "rb") + transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + language=language, + response_format="verbose_json", + timestamp_granularities=["segment"], + ) + + try: + transcript_dict = transcription.model_dump() + except: # noqa + transcript_dict = transcription.to_dict() + + if language is None: + logger.info(f'Language detected: {transcript_dict["language"]}') + language = TO_LANGUAGE_CODE[transcript_dict["language"]] + + chunk_time = chunk_duration * (i) + + for seg in transcript_dict["segments"]: + + if "start" in seg.keys(): + segments.append( + { + "text": seg["text"], + "start": seg["start"] + chunk_time, + "end": seg["end"] + chunk_time, + } + ) + + audio = whisperx.load_audio(input_audio_file) + result = {"segments": segments, "language": language} + + return audio, result + + +def find_whisper_models(): + path = WHISPER_MODELS_PATH + folders = [] + + if os.path.exists(path): + for folder in os.listdir(path): + folder_path = 
os.path.join(path, folder) + if ( + os.path.isdir(folder_path) + and 'model.bin' in os.listdir(folder_path) + ): + folders.append(folder) + return folders + +def transcribe_speech( + audio_wav, + asr_model, + compute_type, + batch_size, + SOURCE_LANGUAGE, + literalize_numbers=True, + segment_duration_limit=15, +): + """ + Transcribe speech using a whisper model. + + Parameters: + - audio_wav (str): Path to the audio file in WAV format. + - asr_model (str): The whisper model to be loaded. + - compute_type (str): Type of compute to be used (e.g., 'int8', 'float16'). + - batch_size (int): Batch size for transcription. + - SOURCE_LANGUAGE (str): Source language for transcription. + + Returns: + - Tuple containing: + - audio: Loaded audio file. + - result: Transcription result as a dictionary. + """ + + if asr_model == "OpenAI_API_Whisper": + if literalize_numbers: + logger.info( + "OpenAI's API Whisper does not support " + "the literalization of numbers." + ) + return openai_api_whisper(audio_wav, SOURCE_LANGUAGE) + + # https://github.com/openai/whisper/discussions/277 + prompt = "以下是普通话的句子。" if SOURCE_LANGUAGE == "zh" else None + SOURCE_LANGUAGE = ( + SOURCE_LANGUAGE if SOURCE_LANGUAGE != "zh-TW" else "zh" + ) + asr_options = { + "initial_prompt": prompt, + "suppress_numerals": literalize_numbers + } + + if asr_model not in ASR_MODEL_OPTIONS: + + base_dir = WHISPER_MODELS_PATH + if not os.path.exists(base_dir): + os.makedirs(base_dir) + model_dir = os.path.join(base_dir, sanitize_file_name(asr_model)) + + if not os.path.exists(model_dir): + from ctranslate2.converters import TransformersConverter + + quantization = "float32" + # Download new model + try: + converter = TransformersConverter( + asr_model, + low_cpu_mem_usage=True, + copy_files=[ + "tokenizer_config.json", "preprocessor_config.json" + ] + ) + converter.convert( + model_dir, + quantization=quantization, + force=False + ) + except Exception as error: + if "File tokenizer_config.json does not exist" in str(error): + converter._copy_files = [ + "tokenizer.json", "preprocessor_config.json" + ] + converter.convert( + model_dir, + quantization=quantization, + force=True + ) + else: + raise error + + asr_model = model_dir + logger.info(f"ASR Model: {str(model_dir)}") + + audio = whisperx.load_audio(audio_wav) + + result = load_and_transcribe_audio( + asr_model, audio, compute_type, SOURCE_LANGUAGE, asr_options, batch_size, segment_duration_limit + ) + + if result["language"] == "zh" and not prompt: + result["language"] = "zh-TW" + logger.info("Chinese - Traditional (zh-TW)") + + + return audio, result + + +def align_speech(audio, result): + """ + Aligns speech segments based on the provided audio and result metadata. + + Parameters: + - audio (array): The audio data in a suitable format for alignment. + - result (dict): Metadata containing information about the segments + and language. + + Returns: + - result (dict): Updated metadata after aligning the segments with + the audio. This includes character-level alignments if + 'return_char_alignments' is set to True. + + Notes: + - This function uses language-specific models to align speech segments. + - It performs language compatibility checks and selects the + appropriate alignment model. + - Cleans up memory by releasing resources after alignment. 
+ """ + DAMHF.update(DAMT) # lang align + if ( + not result["language"] in DAMHF.keys() + and not result["language"] in EXTRA_ALIGN.keys() + ): + logger.warning( + "Automatic detection: Source language not compatible with align" + ) + raise ValueError( + f"Detected language {result['language']} incompatible, " + "you can select the source language to avoid this error." + ) + if ( + result["language"] in EXTRA_ALIGN.keys() + and EXTRA_ALIGN[result["language"]] == "" + ): + lang_name = ( + INVERTED_LANGUAGES[result["language"]] + if result["language"] in INVERTED_LANGUAGES.keys() + else result["language"] + ) + logger.warning( + "No compatible wav2vec2 model found " + f"for the language '{lang_name}', skipping alignment." + ) + return result + + random_sleep() + result = load_align_and_align_segments(result, audio, DAMHF) + + return result + + +diarization_models = { + "pyannote_3.1": "pyannote/speaker-diarization-3.1", + "pyannote_2.1": "pyannote/speaker-diarization@2.1", + "disable": "", +} + + +def reencode_speakers(result): + + if result["segments"][0]["speaker"] == "SPEAKER_00": + return result + + speaker_mapping = {} + counter = 0 + + logger.debug("Reencode speakers") + + for segment in result["segments"]: + old_speaker = segment["speaker"] + if old_speaker not in speaker_mapping: + speaker_mapping[old_speaker] = f"SPEAKER_{counter:02d}" + counter += 1 + segment["speaker"] = speaker_mapping[old_speaker] + + return result + + +def diarize_speech( + audio_wav, + result, + min_speakers, + max_speakers, + YOUR_HF_TOKEN, + model_name="pyannote/speaker-diarization@2.1", +): + """ + Performs speaker diarization on speech segments. + + Parameters: + - audio_wav (array): Audio data in WAV format to perform speaker + diarization. + - result (dict): Metadata containing information about speech segments + and alignments. + - min_speakers (int): Minimum number of speakers expected in the audio. + - max_speakers (int): Maximum number of speakers expected in the audio. + - YOUR_HF_TOKEN (str): Your Hugging Face API token for model + authentication. + - model_name (str): Name of the speaker diarization model to be used + (default: "pyannote/speaker-diarization@2.1"). + + Returns: + - result_diarize (dict): Updated metadata after assigning speaker + labels to segments. + + Notes: + - This function utilizes a speaker diarization model to label speaker + segments in the audio. + - It assigns speakers to word-level segments based on diarization results. + - Cleans up memory by releasing resources after diarization. + - If only one speaker is specified, each segment is automatically assigned + as the first speaker, eliminating the need for diarization inference. + """ + + if max(min_speakers, max_speakers) > 1 and model_name: + try: + + diarize_model = whisperx.DiarizationPipeline( + model_name=model_name, + use_auth_token=YOUR_HF_TOKEN, + device=os.environ.get("SONITR_DEVICE"), + ) + + except Exception as error: + error_str = str(error) + gc.collect() + torch.cuda.empty_cache() # noqa + if "'NoneType' object has no attribute 'to'" in error_str: + if model_name == diarization_models["pyannote_2.1"]: + raise ValueError( + "Accept the license agreement for using Pyannote 2.1." 
+ " You need to have an account on Hugging Face and " + "accept the license to use the models: " + "https://huggingface.co/pyannote/speaker-diarization " + "and https://huggingface.co/pyannote/segmentation " + "Get your KEY TOKEN here: " + "https://hf.co/settings/tokens " + ) + elif model_name == diarization_models["pyannote_3.1"]: + raise ValueError( + "New Licence Pyannote 3.1: You need to have an account" + " on Hugging Face and accept the license to use the " + "models: https://huggingface.co/pyannote/speaker-diarization-3.1 " # noqa + "and https://huggingface.co/pyannote/segmentation-3.0 " + ) + else: + raise error + + random_sleep() + diarize_segments = diarize_audio(diarize_model, audio_wav, min_speakers, max_speakers) + + result_diarize = whisperx.assign_word_speakers( + diarize_segments, result + ) + + for segment in result_diarize["segments"]: + if "speaker" not in segment: + segment["speaker"] = "SPEAKER_00" + logger.warning( + f"No speaker detected in {segment['start']}. First TTS " + f"will be used for the segment text: {segment['text']} " + ) + + del diarize_model + gc.collect() + torch.cuda.empty_cache() # noqa + else: + result_diarize = result + result_diarize["segments"] = [ + {**item, "speaker": "SPEAKER_00"} + for item in result_diarize["segments"] + ] + return reencode_speakers(result_diarize) diff --git a/soni_translate/text_multiformat_processor.py b/soni_translate/text_multiformat_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..fd58c602c984f4d2287b653d1c27446389202ceb --- /dev/null +++ b/soni_translate/text_multiformat_processor.py @@ -0,0 +1,987 @@ +from .logging_setup import logger +from whisperx.utils import get_writer +from .utils import remove_files, run_command, remove_directory_contents +from typing import List +import srt +import re +import os +import copy +import string +import soundfile as sf +from PIL import Image, ImageOps, ImageDraw, ImageFont + +punctuation_list = list( + string.punctuation + "¡¿«»„”“”‚‘’「」『』《》()【】〈〉〔〕〖〗〘〙〚〛⸤⸥⸨⸩" +) +symbol_list = punctuation_list + ["", "..", "..."] + + +def extract_from_srt(file_path): + with open(file_path, "r", encoding="utf-8") as file: + srt_content = file.read() + + subtitle_generator = srt.parse(srt_content) + srt_content_list = list(subtitle_generator) + + return srt_content_list + + +def clean_text(text): + + # Remove content within square brackets + text = re.sub(r'\[.*?\]', '', text) + # Add pattern to remove content within tags + text = re.sub(r'.*?', '', text) + # Remove HTML tags + text = re.sub(r'<.*?>', '', text) + # Remove "♫" and "♪" content + text = re.sub(r'♫.*?♫', '', text) + text = re.sub(r'♪.*?♪', '', text) + # Replace newline characters with an empty string + text = text.replace("\n", ". ") + # Remove double quotation marks + text = text.replace('"', '') + # Collapse multiple spaces and replace with a single space + text = re.sub(r"\s+", " ", text) + # Normalize spaces around periods + text = re.sub(r"[\s\.]+(?=\s)", ". 
", text) + # Check if there are ♫ or ♪ symbols present + if '♫' in text or '♪' in text: + return "" + + text = text.strip() + + # Valid text + return text if text not in symbol_list else "" + + +def srt_file_to_segments(file_path, speaker=False): + try: + srt_content_list = extract_from_srt(file_path) + except Exception as error: + logger.error(str(error)) + fixed_file = "fixed_sub.srt" + remove_files(fixed_file) + fix_sub = f'ffmpeg -i "{file_path}" "{fixed_file}" -y' + run_command(fix_sub) + srt_content_list = extract_from_srt(fixed_file) + + segments = [] + for segment in srt_content_list: + + text = clean_text(str(segment.content)) + + if text: + segments.append( + { + "text": text, + "start": float(segment.start.total_seconds()), + "end": float(segment.end.total_seconds()), + } + ) + + if not segments: + raise Exception("No data found in srt subtitle file") + + if speaker: + segments = [{**seg, "speaker": "SPEAKER_00"} for seg in segments] + + return {"segments": segments} + + +# documents + + +def dehyphenate(lines: List[str], line_no: int) -> List[str]: + next_line = lines[line_no + 1] + word_suffix = next_line.split(" ")[0] + + lines[line_no] = lines[line_no][:-1] + word_suffix + lines[line_no + 1] = lines[line_no + 1][len(word_suffix):] + return lines + + +def remove_hyphens(text: str) -> str: + """ + + This fails for: + * Natural dashes: well-known, self-replication, use-cases, non-semantic, + Post-processing, Window-wise, viewpoint-dependent + * Trailing math operands: 2 - 4 + * Names: Lopez-Ferreras, VGG-19, CIFAR-100 + """ + lines = [line.rstrip() for line in text.split("\n")] + + # Find dashes + line_numbers = [] + for line_no, line in enumerate(lines[:-1]): + if line.endswith("-"): + line_numbers.append(line_no) + + # Replace + for line_no in line_numbers: + lines = dehyphenate(lines, line_no) + + return "\n".join(lines) + + +def pdf_to_txt(pdf_file, start_page, end_page): + from pypdf import PdfReader + + with open(pdf_file, "rb") as file: + reader = PdfReader(file) + logger.debug(f"Total pages: {reader.get_num_pages()}") + text = "" + + start_page_idx = max((start_page-1), 0) + end_page_inx = min((end_page), (reader.get_num_pages())) + document_pages = reader.pages[start_page_idx:end_page_inx] + logger.info( + f"Selected pages from {start_page_idx} to {end_page_inx}: " + f"{len(document_pages)}" + ) + + for page in document_pages: + text += remove_hyphens(page.extract_text()) + return text + + +def docx_to_txt(docx_file): + # https://github.com/AlJohri/docx2pdf update + from docx import Document + + doc = Document(docx_file) + text = "" + for paragraph in doc.paragraphs: + text += paragraph.text + "\n" + return text + + +def replace_multiple_elements(text, replacements): + pattern = re.compile("|".join(map(re.escape, replacements.keys()))) + replaced_text = pattern.sub( + lambda match: replacements[match.group(0)], text + ) + + # Remove multiple spaces + replaced_text = re.sub(r"\s+", " ", replaced_text) + + return replaced_text + + +def document_preprocessor(file_path, is_string, start_page, end_page): + if not is_string: + file_ext = os.path.splitext(file_path)[1].lower() + + if is_string: + text = file_path + elif file_ext == ".pdf": + text = pdf_to_txt(file_path, start_page, end_page) + elif file_ext == ".docx": + text = docx_to_txt(file_path) + elif file_ext == ".txt": + with open( + file_path, "r", encoding='utf-8', errors='replace' + ) as file: + text = file.read() + else: + raise Exception("Unsupported file format") + + # Add space to break segments more easily 
later + replacements = { + "、": "、 ", + "。": "。 ", + # "\n": " ", + } + text = replace_multiple_elements(text, replacements) + + # Save text to a .txt file + # file_name = os.path.splitext(os.path.basename(file_path))[0] + txt_file_path = "./text_preprocessor.txt" + + with open( + txt_file_path, "w", encoding='utf-8', errors='replace' + ) as txt_file: + txt_file.write(text) + + return txt_file_path, text + + +def split_text_into_chunks(text, chunk_size): + words = re.findall(r"\b\w+\b", text) + chunks = [] + current_chunk = "" + for word in words: + if ( + len(current_chunk) + len(word) + 1 <= chunk_size + ): # Adding 1 for the space between words + if current_chunk: + current_chunk += " " + current_chunk += word + else: + chunks.append(current_chunk) + current_chunk = word + if current_chunk: + chunks.append(current_chunk) + return chunks + + +def determine_chunk_size(file_name): + patterns = { + re.compile(r".*-(Male|Female)$"): 1024, # by character + re.compile(r".* BARK$"): 100, # t 64 256 + re.compile(r".* VITS$"): 500, + re.compile( + r".+\.(wav|mp3|ogg|m4a)$" + ): 150, # t 250 400 api automatic split + re.compile(r".* VITS-onnx$"): 250, # automatic sentence split + re.compile(r".* OpenAI-TTS$"): 1024 # max charaters 4096 + } + + for pattern, chunk_size in patterns.items(): + if pattern.match(file_name): + return chunk_size + + # Default chunk size if the file doesn't match any pattern; max 1800 + return 100 + + +def plain_text_to_segments(result_text=None, chunk_size=None): + if not chunk_size: + chunk_size = 100 + text_chunks = split_text_into_chunks(result_text, chunk_size) + + segments_chunks = [] + for num, chunk in enumerate(text_chunks): + chunk_dict = { + "text": chunk, + "start": (1.0 + num), + "end": (2.0 + num), + "speaker": "SPEAKER_00", + } + segments_chunks.append(chunk_dict) + + result_diarize = {"segments": segments_chunks} + + return result_diarize + + +def segments_to_plain_text(result_diarize): + complete_text = "" + for seg in result_diarize["segments"]: + complete_text += seg["text"] + " " # issue + + # Save text to a .txt file + # file_name = os.path.splitext(os.path.basename(file_path))[0] + txt_file_path = "./text_translation.txt" + + with open( + txt_file_path, "w", encoding='utf-8', errors='replace' + ) as txt_file: + txt_file.write(complete_text) + + return txt_file_path, complete_text + + +# doc to video + +COLORS = { + "black": (0, 0, 0), + "white": (255, 255, 255), + "red": (255, 0, 0), + "green": (0, 255, 0), + "blue": (0, 0, 255), + "yellow": (255, 255, 0), + "light_gray": (200, 200, 200), + "light_blue": (173, 216, 230), + "light_green": (144, 238, 144), + "light_yellow": (255, 255, 224), + "light_pink": (255, 182, 193), + "lavender": (230, 230, 250), + "peach": (255, 218, 185), + "light_cyan": (224, 255, 255), + "light_salmon": (255, 160, 122), + "light_green_yellow": (173, 255, 47), +} + +BORDER_COLORS = ["dynamic"] + list(COLORS.keys()) + + +def calculate_average_color(img): + # Resize the image to a small size for faster processing + img_small = img.resize((50, 50)) + # Calculate the average color + average_color = img_small.convert("RGB").resize((1, 1)).getpixel((0, 0)) + return average_color + + +def add_border_to_image( + image_path, + target_width, + target_height, + border_color=None +): + + img = Image.open(image_path) + + # Calculate the width and height for the new image with borders + original_width, original_height = img.size + original_aspect_ratio = original_width / original_height + target_aspect_ratio = target_width / target_height + 
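+    # Worked example (illustrative values): a 1440x1080 (4:3) source placed
+    # into a 1280x720 (16:9) target is resized to 960x720 and then padded
+    # with a 160 px border on the left and right.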
+ # Resize the image to fit the target resolution retaining aspect ratio + if original_aspect_ratio > target_aspect_ratio: + # Image is wider, calculate new height + new_height = int(target_width / original_aspect_ratio) + resized_img = img.resize((target_width, new_height)) + else: + # Image is taller, calculate new width + new_width = int(target_height * original_aspect_ratio) + resized_img = img.resize((new_width, target_height)) + + # Calculate padding for borders + padding = (0, 0, 0, 0) + if resized_img.size[0] != target_width or resized_img.size[1] != target_height: + if original_aspect_ratio > target_aspect_ratio: + # Add borders vertically + padding = (0, (target_height - resized_img.size[1]) // 2, 0, (target_height - resized_img.size[1]) // 2) + else: + # Add borders horizontally + padding = ((target_width - resized_img.size[0]) // 2, 0, (target_width - resized_img.size[0]) // 2, 0) + + # Add borders with specified color + if not border_color or border_color == "dynamic": + border_color = calculate_average_color(resized_img) + else: + border_color = COLORS.get(border_color, (0, 0, 0)) + + bordered_img = ImageOps.expand(resized_img, padding, fill=border_color) + + bordered_img.save(image_path) + + return image_path + + +def resize_and_position_subimage( + subimage, + max_width, + max_height, + subimage_position, + main_width, + main_height +): + subimage_width, subimage_height = subimage.size + + # Resize subimage if it exceeds maximum dimensions + if subimage_width > max_width or subimage_height > max_height: + # Calculate scaling factor + width_scale = max_width / subimage_width + height_scale = max_height / subimage_height + scale = min(width_scale, height_scale) + + # Resize subimage + subimage = subimage.resize( + (int(subimage_width * scale), int(subimage_height * scale)) + ) + + # Calculate position to place the subimage + if subimage_position == "top-left": + subimage_x = 0 + subimage_y = 0 + elif subimage_position == "top-right": + subimage_x = main_width - subimage.width + subimage_y = 0 + elif subimage_position == "bottom-left": + subimage_x = 0 + subimage_y = main_height - subimage.height + elif subimage_position == "bottom-right": + subimage_x = main_width - subimage.width + subimage_y = main_height - subimage.height + else: + raise ValueError( + "Invalid subimage_position. Choose from 'top-left', 'top-right'," + " 'bottom-left', or 'bottom-right'." 
+ ) + + return subimage, subimage_x, subimage_y + + +def create_image_with_text_and_subimages( + text, + subimages, + width, + height, + text_color, + background_color, + output_file +): + # Create an image with the specified resolution and background color + image = Image.new('RGB', (width, height), color=background_color) + + # Initialize ImageDraw object + draw = ImageDraw.Draw(image) + + # Load a font + font = ImageFont.load_default() # You can specify your font file here + + # Calculate text size and position + text_bbox = draw.textbbox((0, 0), text, font=font) + text_width = text_bbox[2] - text_bbox[0] + text_height = text_bbox[3] - text_bbox[1] + text_x = (width - text_width) / 2 + text_y = (height - text_height) / 2 + + # Draw text on the image + draw.text((text_x, text_y), text, fill=text_color, font=font) + + # Paste subimages onto the main image + for subimage_path, subimage_position in subimages: + # Open the subimage + subimage = Image.open(subimage_path) + + # Convert subimage to RGBA mode if it doesn't have an alpha channel + if subimage.mode != 'RGBA': + subimage = subimage.convert('RGBA') + + # Resize and position the subimage + subimage, subimage_x, subimage_y = resize_and_position_subimage( + subimage, width / 4, height / 4, subimage_position, width, height + ) + + # Paste the subimage onto the main image + image.paste(subimage, (int(subimage_x), int(subimage_y)), subimage) + + image.save(output_file) + + return output_file + + +def doc_to_txtximg_pages( + document, + width, + height, + start_page, + end_page, + bcolor +): + from pypdf import PdfReader + + images_folder = "pdf_images/" + os.makedirs(images_folder, exist_ok=True) + remove_directory_contents(images_folder) + + # First image + text_image = os.path.basename(document)[:-4] + subimages = [("./assets/logo.jpeg", "top-left")] + text_color = (255, 255, 255) if bcolor == "black" else (0, 0, 0) # w|b + background_color = COLORS.get(bcolor, (255, 255, 255)) # dynamic white + first_image = "pdf_images/0000_00_aaa.png" + + create_image_with_text_and_subimages( + text_image, + subimages, + width, + height, + text_color, + background_color, + first_image + ) + + reader = PdfReader(document) + logger.debug(f"Total pages: {reader.get_num_pages()}") + + start_page_idx = max((start_page-1), 0) + end_page_inx = min((end_page), (reader.get_num_pages())) + document_pages = reader.pages[start_page_idx:end_page_inx] + + logger.info( + f"Selected pages from {start_page_idx} to {end_page_inx}: " + f"{len(document_pages)}" + ) + + data_doc = {} + for i, page in enumerate(document_pages): + + count = 0 + images = [] + for image_file_object in page.images: + img_name = f"{images_folder}{i:04d}_{count:02d}_{image_file_object.name}" + images.append(img_name) + with open(img_name, "wb") as fp: + fp.write(image_file_object.data) + count += 1 + img_name = add_border_to_image(img_name, width, height, bcolor) + + data_doc[i] = { + "text": remove_hyphens(page.extract_text()), + "images": images + } + + return data_doc + + +def page_data_to_segments(result_text=None, chunk_size=None): + + if not chunk_size: + chunk_size = 100 + + segments_chunks = [] + time_global = 0 + for page, result_data in result_text.items(): + # result_image = result_data["images"] + result_text = result_data["text"] + text_chunks = split_text_into_chunks(result_text, chunk_size) + if not text_chunks: + text_chunks = [" "] + + for chunk in text_chunks: + chunk_dict = { + "text": chunk, + "start": (1.0 + time_global), + "end": (2.0 + time_global), + "speaker": 
"SPEAKER_00", + "page": page, + } + segments_chunks.append(chunk_dict) + time_global += 1 + + result_diarize = {"segments": segments_chunks} + + return result_diarize + + +def update_page_data(result_diarize, doc_data): + complete_text = "" + current_page = result_diarize["segments"][0]["page"] + text_page = "" + + for seg in result_diarize["segments"]: + text = seg["text"] + " " # issue + complete_text += text + + page = seg["page"] + + if page == current_page: + text_page += text + else: + doc_data[current_page]["text"] = text_page + + # Next + text_page = text + current_page = page + + if doc_data[current_page]["text"] != text_page: + doc_data[current_page]["text"] = text_page + + return doc_data + + +def fix_timestamps_docs(result_diarize, audio_files): + current_start = 0.0 + + for seg, audio in zip(result_diarize["segments"], audio_files): + duration = round(sf.info(audio).duration, 2) + + seg["start"] = current_start + current_start += duration + seg["end"] = current_start + + return result_diarize + + +def create_video_from_images( + doc_data, + result_diarize +): + + # First image path + first_image = "pdf_images/0000_00_aaa.png" + + # Time segments and images + max_pages_idx = len(doc_data) - 1 + current_page = result_diarize["segments"][0]["page"] + duration_page = 0.0 + last_image = None + + for seg in result_diarize["segments"]: + start = seg["start"] + end = seg["end"] + duration_seg = end - start + + page = seg["page"] + + if page == current_page: + duration_page += duration_seg + else: + + images = doc_data[current_page]["images"] + + if first_image: + images = [first_image] + images + first_image = None + if not doc_data[min(max_pages_idx, (current_page+1))]["text"].strip(): + images = images + doc_data[min(max_pages_idx, (current_page+1))]["images"] + if not images and last_image: + images = [last_image] + + # Calculate images duration + time_duration_per_image = round((duration_page / len(images)), 2) + doc_data[current_page]["time_per_image"] = time_duration_per_image + + # Next values + doc_data[current_page]["images"] = images + last_image = images[-1] + duration_page = duration_seg + current_page = page + + if "time_per_image" not in doc_data[current_page].keys(): + images = doc_data[current_page]["images"] + if first_image: + images = [first_image] + images + if not images: + images = [last_image] + time_duration_per_image = round((duration_page / len(images)), 2) + doc_data[current_page]["time_per_image"] = time_duration_per_image + + # Timestamped image video. 
+ with open("list.txt", "w") as file: + + for i, page in enumerate(doc_data.values()): + + duration = page["time_per_image"] + for img in page["images"]: + if i == len(doc_data) - 1 and img == page["images"][-1]: # Check if it's the last item + file.write(f"file {img}\n") + file.write(f"outpoint {duration}") + else: + file.write(f"file {img}\n") + file.write(f"outpoint {duration}\n") + + out_video = "video_from_images.mp4" + remove_files(out_video) + + cm = f"ffmpeg -y -f concat -i list.txt -c:v libx264 -preset veryfast -crf 18 -pix_fmt yuv420p {out_video}" + cm_alt = f"ffmpeg -f concat -i list.txt -c:v libx264 -r 30 -pix_fmt yuv420p -y {out_video}" + try: + run_command(cm) + except Exception as error: + logger.error(str(error)) + remove_files(out_video) + run_command(cm_alt) + + return out_video + + +def merge_video_and_audio(video_doc, final_wav_file): + + fixed_audio = "fixed_audio.mp3" + remove_files(fixed_audio) + cm = f"ffmpeg -i {final_wav_file} -c:a libmp3lame {fixed_audio}" + run_command(cm) + + vid_out = "video_book.mp4" + remove_files(vid_out) + cm = f"ffmpeg -i {video_doc} -i {fixed_audio} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {vid_out}" + run_command(cm) + + return vid_out + + +# subtitles + + +def get_subtitle( + language, + segments_data, + extension, + filename=None, + highlight_words=False, +): + if not filename: + filename = "task_subtitle" + + is_ass_extension = False + if extension == "ass": + is_ass_extension = True + extension = "srt" + + sub_file = filename + "." + extension + support_name = filename + ".mp3" + remove_files(sub_file) + + writer = get_writer(extension, output_dir=".") + word_options = { + "highlight_words": highlight_words, + "max_line_count": None, + "max_line_width": None, + } + + # Get data subs + subtitle_data = copy.deepcopy(segments_data) + subtitle_data["language"] = ( + "ja" if language in ["ja", "zh", "zh-TW"] else language + ) + + # Clean + if not highlight_words: + subtitle_data.pop("word_segments", None) + for segment in subtitle_data["segments"]: + for key in ["speaker", "chars", "words"]: + segment.pop(key, None) + + writer( + subtitle_data, + support_name, + word_options, + ) + + if is_ass_extension: + temp_name = filename + ".ass" + remove_files(temp_name) + convert_sub = f'ffmpeg -i "{sub_file}" "{temp_name}" -y' + run_command(convert_sub) + sub_file = temp_name + + return sub_file + + +def process_subtitles( + deep_copied_result, + align_language, + result_diarize, + output_format_subtitle, + TRANSLATE_AUDIO_TO, +): + name_ori = "sub_ori." + name_tra = "sub_tra." + remove_files( + [name_ori + output_format_subtitle, name_tra + output_format_subtitle] + ) + + writer = get_writer(output_format_subtitle, output_dir=".") + word_options = { + "highlight_words": False, + "max_line_count": None, + "max_line_width": None, + } + + # original lang + subs_copy_result = copy.deepcopy(deep_copied_result) + subs_copy_result["language"] = ( + "zh" if align_language == "zh-TW" else align_language + ) + for segment in subs_copy_result["segments"]: + segment.pop("speaker", None) + + try: + writer( + subs_copy_result, + name_ori[:-1] + ".mp3", + word_options, + ) + except Exception as error: + logger.error(str(error)) + if str(error) == "list indices must be integers or slices, not str": + logger.error( + "Related to poor word segmentation" + " in segments after alignment." 
+ ) + subs_copy_result["segments"][0].pop("words") + writer( + subs_copy_result, + name_ori[:-1] + ".mp3", + word_options, + ) + + # translated lang + subs_tra_copy_result = copy.deepcopy(result_diarize) + subs_tra_copy_result["language"] = ( + "ja" if TRANSLATE_AUDIO_TO in ["ja", "zh", "zh-TW"] else align_language + ) + subs_tra_copy_result.pop("word_segments", None) + for segment in subs_tra_copy_result["segments"]: + for key in ["speaker", "chars", "words"]: + segment.pop(key, None) + + writer( + subs_tra_copy_result, + name_tra[:-1] + ".mp3", + word_options, + ) + + return name_tra + output_format_subtitle + + +def linguistic_level_segments( + result_base, + linguistic_unit="word", # word or char +): + linguistic_unit = linguistic_unit[:4] + linguistic_unit_key = linguistic_unit + "s" + result = copy.deepcopy(result_base) + + if linguistic_unit_key not in result["segments"][0].keys(): + raise ValueError("No alignment detected, can't process") + + segments_by_unit = [] + for segment in result["segments"]: + segment_units = segment[linguistic_unit_key] + # segment_speaker = segment.get("speaker", "SPEAKER_00") + + for unit in segment_units: + + text = unit[linguistic_unit] + + if "start" in unit.keys(): + segments_by_unit.append( + { + "start": unit["start"], + "end": unit["end"], + "text": text, + # "speaker": segment_speaker, + } + ) + elif not segments_by_unit: + pass + else: + segments_by_unit[-1]["text"] += text + + return {"segments": segments_by_unit} + + +def break_aling_segments( + result: dict, + break_characters: str = "", # ":|,|.|" +): + result_align = copy.deepcopy(result) + + break_characters_list = break_characters.split("|") + break_characters_list = [i for i in break_characters_list if i != ''] + + if not break_characters_list: + logger.info("No valid break characters were specified.") + return result + + logger.info(f"Redivide text segments by: {str(break_characters_list)}") + + # create new with filters + normal = [] + + def process_chars(chars, letter_new_start, num, text): + start_key, end_key = "start", "end" + start_value = end_value = None + + for char in chars: + if start_key in char: + start_value = char[start_key] + break + + for char in reversed(chars): + if end_key in char: + end_value = char[end_key] + break + + if not start_value or not end_value: + raise Exception( + f"Unable to obtain a valid timestamp for chars: {str(chars)}" + ) + + return { + "start": start_value, + "end": end_value, + "text": text, + "words": chars, + } + + for i, segment in enumerate(result_align['segments']): + + logger.debug(f"- Process segment: {i}, text: {segment['text']}") + # start = segment['start'] + letter_new_start = 0 + for num, char in enumerate(segment['chars']): + + if char["char"] is None: + continue + + # if "start" in char: + # start = char["start"] + + # if "end" in char: + # end = char["end"] + + # Break by character + if char['char'] in break_characters_list: + + text = segment['text'][letter_new_start:num+1] + + logger.debug( + f"Break in: {char['char']}, position: {num}, text: {text}" + ) + + chars = segment['chars'][letter_new_start:num+1] + + if not text: + logger.debug("No text") + continue + + if num == 0 and not text.strip(): + logger.debug("blank space in start") + continue + + if len(text) == 1: + logger.debug(f"Short char append, num: {num}") + normal[-1]["text"] += text + normal[-1]["words"].append(chars) + continue + + # logger.debug(chars) + normal_dict = process_chars(chars, letter_new_start, num, text) + + letter_new_start = num+1 + + 
normal.append(normal_dict) + + # If we reach the end of the segment, add the last part of chars. + if num == len(segment["chars"]) - 1: + + text = segment['text'][letter_new_start:num+1] + + # If remain text len is not default len text + if num not in [len(text)-1, len(text)] and text: + logger.debug(f'Remaining text: {text}') + + if not text: + logger.debug("No remaining text.") + continue + + if len(text) == 1: + logger.debug(f"Short char append, num: {num}") + normal[-1]["text"] += text + normal[-1]["words"].append(chars) + continue + + chars = segment['chars'][letter_new_start:num+1] + + normal_dict = process_chars(chars, letter_new_start, num, text) + + letter_new_start = num+1 + + normal.append(normal_dict) + + # Rename char to word + for item in normal: + words_list = item['words'] + for word_item in words_list: + if 'char' in word_item: + word_item['word'] = word_item.pop('char') + + # Convert to dict default + break_segments = {"segments": normal} + + msg_count = ( + f"Segment count before: {len(result['segments'])}, " + f"after: {len(break_segments['segments'])}." + ) + logger.info(msg_count) + + return break_segments diff --git a/soni_translate/text_to_speech.py b/soni_translate/text_to_speech.py new file mode 100644 index 0000000000000000000000000000000000000000..07afd356c35f75ab74f6db779e4f3d166ec82480 --- /dev/null +++ b/soni_translate/text_to_speech.py @@ -0,0 +1,1574 @@ +from gtts import gTTS +import edge_tts, asyncio, json, glob # noqa +from tqdm import tqdm +import librosa, os, re, torch, gc, subprocess # noqa +from .language_configuration import ( + fix_code_language, + BARK_VOICES_LIST, + VITS_VOICES_LIST, +) +from .utils import ( + download_manager, + create_directories, + copy_files, + rename_file, + remove_directory_contents, + remove_files, + run_command, +) +import numpy as np +from typing import Any, Dict +from pathlib import Path +import soundfile as sf +import platform +import logging +import traceback +from .logging_setup import logger + + +class TTS_OperationError(Exception): + def __init__(self, message="The operation did not complete successfully."): + self.message = message + super().__init__(self.message) + + +def verify_saved_file_and_size(filename): + if not os.path.exists(filename): + raise TTS_OperationError(f"File '{filename}' was not saved.") + if os.path.getsize(filename) == 0: + raise TTS_OperationError( + f"File '{filename}' has a zero size. 
" + "Related to incorrect TTS for the target language" + ) + + +def error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename): + traceback.print_exc() + logger.error(f"Error: {str(error)}") + try: + from tempfile import TemporaryFile + + tts = gTTS(segment["text"], lang=fix_code_language(TRANSLATE_AUDIO_TO)) + # tts.save(filename) + f = TemporaryFile() + tts.write_to_fp(f) + + # Reset the file pointer to the beginning of the file + f.seek(0) + + # Read audio data from the TemporaryFile using soundfile + audio_data, samplerate = sf.read(f) + f.close() # Close the TemporaryFile + sf.write( + filename, audio_data, samplerate, format="ogg", subtype="vorbis" + ) + + logger.warning( + 'TTS auxiliary will be utilized ' + f'rather than TTS: {segment["tts_name"]}' + ) + verify_saved_file_and_size(filename) + except Exception as error: + logger.critical(f"Error: {str(error)}") + sample_rate_aux = 22050 + duration = float(segment["end"]) - float(segment["start"]) + data = np.zeros(int(sample_rate_aux * duration)).astype(np.float32) + sf.write( + filename, data, sample_rate_aux, format="ogg", subtype="vorbis" + ) + logger.error("Audio will be replaced -> [silent audio].") + verify_saved_file_and_size(filename) + + +def pad_array(array, sr): + + if isinstance(array, list): + array = np.array(array) + + if not array.shape[0]: + raise ValueError("The generated audio does not contain any data") + + valid_indices = np.where(np.abs(array) > 0.001)[0] + + if len(valid_indices) == 0: + logger.debug(f"No valid indices: {array}") + return array + + try: + pad_indice = int(0.1 * sr) + start_pad = max(0, valid_indices[0] - pad_indice) + end_pad = min(len(array), valid_indices[-1] + 1 + pad_indice) + padded_array = array[start_pad:end_pad] + return padded_array + except Exception as error: + logger.error(str(error)) + return array + + +# ===================================== +# EDGE TTS +# ===================================== + + +def edge_tts_voices_list(): + try: + completed_process = subprocess.run( + ["edge-tts", "--list-voices"], capture_output=True, text=True + ) + lines = completed_process.stdout.strip().split("\n") + except Exception as error: + logger.debug(str(error)) + lines = [] + + voices = [] + for line in lines: + if line.startswith("Name: "): + voice_entry = {} + voice_entry["Name"] = line.split(": ")[1] + elif line.startswith("Gender: "): + voice_entry["Gender"] = line.split(": ")[1] + voices.append(voice_entry) + + formatted_voices = [ + f"{entry['Name']}-{entry['Gender']}" for entry in voices + ] + + if not formatted_voices: + logger.warning( + "The list of Edge TTS voices could not be obtained, " + "switching to an alternative method" + ) + tts_voice_list = asyncio.new_event_loop().run_until_complete( + edge_tts.list_voices() + ) + formatted_voices = sorted( + [f"{v['ShortName']}-{v['Gender']}" for v in tts_voice_list] + ) + + if not formatted_voices: + logger.error("Can't get EDGE TTS - list voices") + + return formatted_voices + + +def segments_egde_tts(filtered_edge_segments, TRANSLATE_AUDIO_TO, is_gui): + for segment in tqdm(filtered_edge_segments["segments"]): + speaker = segment["speaker"] # noqa + text = segment["text"] + start = segment["start"] + tts_name = segment["tts_name"] + + # make the tts audio + filename = f"audio/{start}.ogg" + temp_file = filename[:-3] + "mp3" + + logger.info(f"{text} >> {filename}") + try: + if is_gui: + asyncio.run( + edge_tts.Communicate( + text, "-".join(tts_name.split("-")[:-1]) + ).save(temp_file) + ) + else: + # nest_asyncio.apply() if not 
is_gui else None + command = f'edge-tts -t "{text}" -v "{tts_name.replace("-Male", "").replace("-Female", "")}" --write-media "{temp_file}"' + run_command(command) + verify_saved_file_and_size(temp_file) + + data, sample_rate = sf.read(temp_file) + data = pad_array(data, sample_rate) + # os.remove(temp_file) + + # Save file + sf.write( + file=filename, + samplerate=sample_rate, + data=data, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + + +# ===================================== +# BARK TTS +# ===================================== + + +def segments_bark_tts( + filtered_bark_segments, TRANSLATE_AUDIO_TO, model_id_bark="suno/bark-small" +): + from transformers import AutoProcessor, BarkModel + from optimum.bettertransformer import BetterTransformer + + device = os.environ.get("SONITR_DEVICE") + torch_dtype_env = torch.float16 if device == "cuda" else torch.float32 + + # load model bark + model = BarkModel.from_pretrained( + model_id_bark, torch_dtype=torch_dtype_env + ).to(device) + model = model.to(device) + processor = AutoProcessor.from_pretrained( + model_id_bark, return_tensors="pt" + ) # , padding=True + if device == "cuda": + # convert to bettertransformer + model = BetterTransformer.transform(model, keep_original_model=False) + # enable CPU offload + # model.enable_cpu_offload() + sampling_rate = model.generation_config.sample_rate + + # filtered_segments = filtered_bark_segments['segments'] + # Sorting the segments by 'tts_name' + # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) + # logger.debug(sorted_segments) + + for segment in tqdm(filtered_bark_segments["segments"]): + speaker = segment["speaker"] # noqa + text = segment["text"] + start = segment["start"] + tts_name = segment["tts_name"] + + inputs = processor(text, voice_preset=BARK_VOICES_LIST[tts_name]).to( + device + ) + + # make the tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + try: + # Infer + with torch.inference_mode(): + speech_output = model.generate( + **inputs, + do_sample=True, + fine_temperature=0.4, + coarse_temperature=0.8, + pad_token_id=processor.tokenizer.pad_token_id, + ) + # Save file + data_tts = pad_array( + speech_output.cpu().numpy().squeeze().astype(np.float32), + sampling_rate, + ) + sf.write( + file=filename, + samplerate=sampling_rate, + data=data_tts, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + gc.collect() + torch.cuda.empty_cache() + try: + del processor + del model + gc.collect() + torch.cuda.empty_cache() + except Exception as error: + logger.error(str(error)) + gc.collect() + torch.cuda.empty_cache() + + +# ===================================== +# VITS TTS +# ===================================== + + +def uromanize(input_string): + """Convert non-Roman strings to Roman using the `uroman` perl package.""" + # script_path = os.path.join(uroman_path, "bin", "uroman.pl") + + if not os.path.exists("./uroman"): + logger.info( + "Clonning repository uroman https://github.com/isi-nlp/uroman.git" + " for romanize the text" + ) + process = subprocess.Popen( + ["git", "clone", "https://github.com/isi-nlp/uroman.git"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = process.communicate() + script_path = os.path.join("./uroman", "bin", "uroman.pl") + 
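+    # uroman.pl reads the input text on stdin and writes the romanized
+    # form to stdout, so the script is driven through a pipe below.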
+ command = ["perl", script_path] + + process = subprocess.Popen( + command, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + # Execute the perl command + stdout, stderr = process.communicate(input=input_string.encode()) + + if process.returncode != 0: + raise ValueError(f"Error {process.returncode}: {stderr.decode()}") + + # Return the output as a string and skip the new-line character at the end + return stdout.decode()[:-1] + + +def segments_vits_tts(filtered_vits_segments, TRANSLATE_AUDIO_TO): + from transformers import VitsModel, AutoTokenizer + + filtered_segments = filtered_vits_segments["segments"] + # Sorting the segments by 'tts_name' + sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"]) + logger.debug(sorted_segments) + + model_name_key = None + for segment in tqdm(sorted_segments): + speaker = segment["speaker"] # noqa + text = segment["text"] + start = segment["start"] + tts_name = segment["tts_name"] + + if tts_name != model_name_key: + model_name_key = tts_name + model = VitsModel.from_pretrained(VITS_VOICES_LIST[tts_name]) + tokenizer = AutoTokenizer.from_pretrained( + VITS_VOICES_LIST[tts_name] + ) + sampling_rate = model.config.sampling_rate + + if tokenizer.is_uroman: + romanize_text = uromanize(text) + logger.debug(f"Romanize text: {romanize_text}") + inputs = tokenizer(romanize_text, return_tensors="pt") + else: + inputs = tokenizer(text, return_tensors="pt") + + # make the tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + try: + # Infer + with torch.no_grad(): + speech_output = model(**inputs).waveform + + data_tts = pad_array( + speech_output.cpu().numpy().squeeze().astype(np.float32), + sampling_rate, + ) + # Save file + sf.write( + file=filename, + samplerate=sampling_rate, + data=data_tts, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + gc.collect() + torch.cuda.empty_cache() + try: + del tokenizer + del model + gc.collect() + torch.cuda.empty_cache() + except Exception as error: + logger.error(str(error)) + gc.collect() + torch.cuda.empty_cache() + + +# ===================================== +# Coqui XTTS +# ===================================== + + +def coqui_xtts_voices_list(): + main_folder = "_XTTS_" + pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$") + pattern_automatic_speaker = re.compile(r"AUTOMATIC_SPEAKER_\d+\.wav$") + + # List only files in the directory matching the pattern but not matching + # AUTOMATIC_SPEAKER_00.wav, AUTOMATIC_SPEAKER_01.wav, etc. 
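+    # e.g. a file "my_speaker.wav" inside _XTTS_/ is returned as
+    # "_XTTS_/my_speaker.wav"; AUTOMATIC_SPEAKER_XX.wav files are skipped.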
+ wav_voices = [ + "_XTTS_/" + f + for f in os.listdir(main_folder) + if os.path.isfile(os.path.join(main_folder, f)) + and pattern_coqui.match(f) + and not pattern_automatic_speaker.match(f) + ] + + return ["_XTTS_/AUTOMATIC.wav"] + wav_voices + + +def seconds_to_hhmmss_ms(seconds): + hours = seconds // 3600 + minutes = (seconds % 3600) // 60 + seconds = seconds % 60 + milliseconds = int((seconds - int(seconds)) * 1000) + return "%02d:%02d:%02d.%03d" % (hours, minutes, int(seconds), milliseconds) + + +def audio_trimming(audio_path, destination, start, end): + if isinstance(start, (int, float)): + start = seconds_to_hhmmss_ms(start) + if isinstance(end, (int, float)): + end = seconds_to_hhmmss_ms(end) + + if destination: + file_directory = destination + else: + file_directory = os.path.dirname(audio_path) + + file_name = os.path.splitext(os.path.basename(audio_path))[0] + file_ = f"{file_name}_trim.wav" + # file_ = f'{os.path.splitext(audio_path)[0]}_trim.wav' + output_path = os.path.join(file_directory, file_) + + # -t (duration from -ss) | -to (time stop) | -af silenceremove=1:0:-50dB (remove silence) + command = f'ffmpeg -y -loglevel error -i "{audio_path}" -ss {start} -to {end} -acodec pcm_s16le -f wav "{output_path}"' + run_command(command) + + return output_path + + +def convert_to_xtts_good_sample(audio_path: str = "", destination: str = ""): + if destination: + file_directory = destination + else: + file_directory = os.path.dirname(audio_path) + + file_name = os.path.splitext(os.path.basename(audio_path))[0] + file_ = f"{file_name}_good_sample.wav" + # file_ = f'{os.path.splitext(audio_path)[0]}_good_sample.wav' + mono_path = os.path.join(file_directory, file_) # get root + + command = f'ffmpeg -y -loglevel error -i "{audio_path}" -ac 1 -ar 22050 -sample_fmt s16 -f wav "{mono_path}"' + run_command(command) + + return mono_path + + +def sanitize_file_name(file_name): + import unicodedata + + # Normalize the string to NFKD form to separate combined characters into + # base characters and diacritics + normalized_name = unicodedata.normalize("NFKD", file_name) + # Replace any non-ASCII characters or special symbols with an underscore + sanitized_name = re.sub(r"[^\w\s.-]", "_", normalized_name) + return sanitized_name + + +def create_wav_file_vc( + sample_name="", # name final file + audio_wav="", # path + start=None, # trim start + end=None, # trim end + output_final_path="_XTTS_", + get_vocals_dereverb=True, +): + sample_name = sample_name if sample_name else "default_name" + sample_name = sanitize_file_name(sample_name) + audio_wav = audio_wav if isinstance(audio_wav, str) else audio_wav.name + + BASE_DIR = ( + "." 
# os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ) + + output_dir = os.path.join(BASE_DIR, "clean_song_output") # remove content + # remove_directory_contents(output_dir) + + if start or end: + # Cut file + audio_segment = audio_trimming(audio_wav, output_dir, start, end) + else: + # Complete file + audio_segment = audio_wav + + from .mdx_net import process_uvr_task + + try: + _, _, _, _, audio_segment = process_uvr_task( + orig_song_path=audio_segment, + main_vocals=True, + dereverb=get_vocals_dereverb, + ) + except Exception as error: + logger.error(str(error)) + + sample = convert_to_xtts_good_sample(audio_segment) + + sample_name = f"{sample_name}.wav" + sample_rename = rename_file(sample, sample_name) + + copy_files(sample_rename, output_final_path) + + final_sample = os.path.join(output_final_path, sample_name) + if os.path.exists(final_sample): + logger.info(final_sample) + return final_sample + else: + raise Exception(f"Error wav: {final_sample}") + + +def create_new_files_for_vc( + speakers_coqui, + segments_base, + dereverb_automatic=True +): + # before function delete automatic delete_previous_automatic + output_dir = os.path.join(".", "clean_song_output") # remove content + remove_directory_contents(output_dir) + + for speaker in speakers_coqui: + filtered_speaker = [ + segment + for segment in segments_base + if segment["speaker"] == speaker + ] + if len(filtered_speaker) > 4: + filtered_speaker = filtered_speaker[1:] + if filtered_speaker[0]["tts_name"] == "_XTTS_/AUTOMATIC.wav": + name_automatic_wav = f"AUTOMATIC_{speaker}" + if os.path.exists(f"_XTTS_/{name_automatic_wav}.wav"): + logger.info(f"WAV automatic {speaker} exists") + # path_wav = path_automatic_wav + pass + else: + # create wav + wav_ok = False + for seg in filtered_speaker: + duration = float(seg["end"]) - float(seg["start"]) + if duration > 7.0 and duration < 12.0: + logger.info( + f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}' + ) + create_wav_file_vc( + sample_name=name_automatic_wav, + audio_wav="audio.wav", + start=(float(seg["start"]) + 1.0), + end=(float(seg["end"]) - 1.0), + get_vocals_dereverb=dereverb_automatic, + ) + wav_ok = True + break + + if not wav_ok: + logger.info("Taking the first segment") + seg = filtered_speaker[0] + logger.info( + f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}' + ) + max_duration = float(seg["end"]) - float(seg["start"]) + max_duration = max(2.0, min(max_duration, 9.0)) + + create_wav_file_vc( + sample_name=name_automatic_wav, + audio_wav="audio.wav", + start=(float(seg["start"])), + end=(float(seg["start"]) + max_duration), + get_vocals_dereverb=dereverb_automatic, + ) + + +def segments_coqui_tts( + filtered_coqui_segments, + TRANSLATE_AUDIO_TO, + model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2", + speakers_coqui=None, + delete_previous_automatic=True, + dereverb_automatic=True, + emotion=None, +): + """XTTS + Install: + pip install -q TTS==0.21.1 + pip install -q numpy==1.23.5 + + Notes: + - tts_name is the wav|mp3|ogg|m4a file for VC + """ + from TTS.api import TTS + + TRANSLATE_AUDIO_TO = fix_code_language(TRANSLATE_AUDIO_TO, syntax="coqui") + supported_lang_coqui = [ + "zh-cn", + "en", + "fr", + "de", + "it", + "pt", + "pl", + "tr", + "ru", + "nl", + "cs", + "ar", + "es", + "hu", + "ko", + "ja", + ] + if TRANSLATE_AUDIO_TO not in supported_lang_coqui: + raise TTS_OperationError( + f"'{TRANSLATE_AUDIO_TO}' is not a supported language for Coqui XTTS" + ) 
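+    # Note: each segment's "tts_name" points at the reference WAV used for
+    # voice cloning; the "_XTTS_/AUTOMATIC.wav" placeholder is swapped for a
+    # per-speaker sample created by create_new_files_for_vc() further below.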
+ # Emotion and speed can only be used with Coqui Studio models. discontinued + # emotions = ["Neutral", "Happy", "Sad", "Angry", "Dull"] + + if delete_previous_automatic: + for spk in speakers_coqui: + remove_files(f"_XTTS_/AUTOMATIC_{spk}.wav") + + directory_audios_vc = "_XTTS_" + create_directories(directory_audios_vc) + create_new_files_for_vc( + speakers_coqui, + filtered_coqui_segments["segments"], + dereverb_automatic, + ) + + # Init TTS + device = os.environ.get("SONITR_DEVICE") + model = TTS(model_id_coqui).to(device) + sampling_rate = 24000 + + # filtered_segments = filtered_coqui_segments['segments'] + # Sorting the segments by 'tts_name' + # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) + # logger.debug(sorted_segments) + + for segment in tqdm(filtered_coqui_segments["segments"]): + speaker = segment["speaker"] + text = segment["text"] + start = segment["start"] + tts_name = segment["tts_name"] + if tts_name == "_XTTS_/AUTOMATIC.wav": + tts_name = f"_XTTS_/AUTOMATIC_{speaker}.wav" + + # make the tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + try: + # Infer + wav = model.tts( + text=text, speaker_wav=tts_name, language=TRANSLATE_AUDIO_TO + ) + data_tts = pad_array( + wav, + sampling_rate, + ) + # Save file + sf.write( + file=filename, + samplerate=sampling_rate, + data=data_tts, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + gc.collect() + torch.cuda.empty_cache() + try: + del model + gc.collect() + torch.cuda.empty_cache() + except Exception as error: + logger.error(str(error)) + gc.collect() + torch.cuda.empty_cache() + + +# ===================================== +# PIPER TTS +# ===================================== + + +def piper_tts_voices_list(): + file_path = download_manager( + url="https://huggingface.co/rhasspy/piper-voices/resolve/main/voices.json", + path="./PIPER_MODELS", + ) + + with open(file_path, "r", encoding="utf8") as file: + data = json.load(file) + piper_id_models = [key + " VITS-onnx" for key in data.keys()] + + return piper_id_models + + +def replace_text_in_json(file_path, key_to_replace, new_text, condition=None): + # Read the JSON file + with open(file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + # Modify the specified key's value with the new text + if key_to_replace in data: + if condition: + value_condition = condition + else: + value_condition = data[key_to_replace] + + if data[key_to_replace] == value_condition: + data[key_to_replace] = new_text + + # Write the modified content back to the JSON file + with open(file_path, "w") as file: + json.dump( + data, file, indent=2 + ) # Write the modified data back to the file with indentation for readability + + +def load_piper_model( + model: str, + data_dir: list, + download_dir: str = "", + update_voices: bool = False, +): + from piper import PiperVoice + from piper.download import ensure_voice_exists, find_voice, get_voices + + try: + import onnxruntime as rt + + if rt.get_device() == "GPU" and os.environ.get("SONITR_DEVICE") == "cuda": + logger.debug("onnxruntime device > GPU") + cuda = True + else: + logger.info( + "onnxruntime device > CPU" + ) # try pip install onnxruntime-gpu + cuda = False + except Exception as error: + raise TTS_OperationError(f"onnxruntime error: {str(error)}") + + # Disable CUDA in Windows + if platform.system() == "Windows": + logger.info("Employing CPU 
exclusivity with Piper TTS") + cuda = False + + if not download_dir: + # Download to first data directory by default + download_dir = data_dir[0] + else: + data_dir = [os.path.join(data_dir[0], download_dir)] + + # Download voice if file doesn't exist + model_path = Path(model) + if not model_path.exists(): + # Load voice info + voices_info = get_voices(download_dir, update_voices=update_voices) + + # Resolve aliases for backwards compatibility with old voice names + aliases_info: Dict[str, Any] = {} + for voice_info in voices_info.values(): + for voice_alias in voice_info.get("aliases", []): + aliases_info[voice_alias] = {"_is_alias": True, **voice_info} + + voices_info.update(aliases_info) + ensure_voice_exists(model, data_dir, download_dir, voices_info) + model, config = find_voice(model, data_dir) + + replace_text_in_json( + config, "phoneme_type", "espeak", "PhonemeType.ESPEAK" + ) + + # Load voice + voice = PiperVoice.load(model, config_path=config, use_cuda=cuda) + + return voice + + +def synthesize_text_to_audio_np_array(voice, text, synthesize_args): + audio_stream = voice.synthesize_stream_raw(text, **synthesize_args) + + # Collect the audio bytes into a single NumPy array + audio_data = b"" + for audio_bytes in audio_stream: + audio_data += audio_bytes + + # Ensure correct data type and convert audio bytes to NumPy array + audio_np = np.frombuffer(audio_data, dtype=np.int16) + return audio_np + + +def segments_vits_onnx_tts(filtered_onnx_vits_segments, TRANSLATE_AUDIO_TO): + """ + Install: + pip install -q piper-tts==1.2.0 onnxruntime-gpu # for cuda118 + """ + + data_dir = [ + str(Path.cwd()) + ] # "Data directory to check for downloaded models (default: current directory)" + download_dir = "PIPER_MODELS" + # model_name = "en_US-lessac-medium" tts_name in a dict like VITS + update_voices = True # "Download latest voices.json during startup", + + synthesize_args = { + "speaker_id": None, + "length_scale": 1.0, + "noise_scale": 0.667, + "noise_w": 0.8, + "sentence_silence": 0.0, + } + + filtered_segments = filtered_onnx_vits_segments["segments"] + # Sorting the segments by 'tts_name' + sorted_segments = sorted(filtered_segments, key=lambda x: x["tts_name"]) + logger.debug(sorted_segments) + + model_name_key = None + for segment in tqdm(sorted_segments): + speaker = segment["speaker"] # noqa + text = segment["text"] + start = segment["start"] + tts_name = segment["tts_name"].replace(" VITS-onnx", "") + + if tts_name != model_name_key: + model_name_key = tts_name + model = load_piper_model( + tts_name, data_dir, download_dir, update_voices + ) + sampling_rate = model.config.sample_rate + + # make the tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + try: + # Infer + speech_output = synthesize_text_to_audio_np_array( + model, text, synthesize_args + ) + data_tts = pad_array( + speech_output, # .cpu().numpy().squeeze().astype(np.float32), + sampling_rate, + ) + # Save file + sf.write( + file=filename, + samplerate=sampling_rate, + data=data_tts, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + gc.collect() + torch.cuda.empty_cache() + try: + del model + gc.collect() + torch.cuda.empty_cache() + except Exception as error: + logger.error(str(error)) + gc.collect() + torch.cuda.empty_cache() + + +# ===================================== +# CLOSEAI TTS +# ===================================== + + +def segments_openai_tts( + 
filtered_openai_tts_segments, TRANSLATE_AUDIO_TO +): + from openai import OpenAI + + client = OpenAI() + sampling_rate = 24000 + + # filtered_segments = filtered_openai_tts_segments['segments'] + # Sorting the segments by 'tts_name' + # sorted_segments = sorted(filtered_segments, key=lambda x: x['tts_name']) + + for segment in tqdm(filtered_openai_tts_segments["segments"]): + speaker = segment["speaker"] # noqa + text = segment["text"].strip() + start = segment["start"] + tts_name = segment["tts_name"] + + # make the tts audio + filename = f"audio/{start}.ogg" + logger.info(f"{text} >> {filename}") + + try: + # Request + response = client.audio.speech.create( + model="tts-1-hd" if "HD" in tts_name else "tts-1", + voice=tts_name.split()[0][1:], + response_format="wav", + input=text + ) + + audio_bytes = b'' + for data in response.iter_bytes(chunk_size=4096): + audio_bytes += data + + speech_output = np.frombuffer(audio_bytes, dtype=np.int16) + + # Save file + data_tts = pad_array( + speech_output[240:], + sampling_rate, + ) + + sf.write( + file=filename, + samplerate=sampling_rate, + data=data_tts, + format="ogg", + subtype="vorbis", + ) + verify_saved_file_and_size(filename) + + except Exception as error: + error_handling_in_tts(error, segment, TRANSLATE_AUDIO_TO, filename) + + +# ===================================== +# Select task TTS +# ===================================== + + +def find_spkr(pattern, speaker_to_voice, segments): + return [ + speaker + for speaker, voice in speaker_to_voice.items() + if pattern.match(voice) and any( + segment["speaker"] == speaker for segment in segments + ) + ] + + +def filter_by_speaker(speakers, segments): + return { + "segments": [ + segment + for segment in segments + if segment["speaker"] in speakers + ] + } + + +def audio_segmentation_to_voice( + result_diarize, + TRANSLATE_AUDIO_TO, + is_gui, + tts_voice00, + tts_voice01="", + tts_voice02="", + tts_voice03="", + tts_voice04="", + tts_voice05="", + tts_voice06="", + tts_voice07="", + tts_voice08="", + tts_voice09="", + tts_voice10="", + tts_voice11="", + dereverb_automatic=True, + model_id_bark="suno/bark-small", + model_id_coqui="tts_models/multilingual/multi-dataset/xtts_v2", + delete_previous_automatic=True, +): + + remove_directory_contents("audio") + + # Mapping speakers to voice variables + speaker_to_voice = { + "SPEAKER_00": tts_voice00, + "SPEAKER_01": tts_voice01, + "SPEAKER_02": tts_voice02, + "SPEAKER_03": tts_voice03, + "SPEAKER_04": tts_voice04, + "SPEAKER_05": tts_voice05, + "SPEAKER_06": tts_voice06, + "SPEAKER_07": tts_voice07, + "SPEAKER_08": tts_voice08, + "SPEAKER_09": tts_voice09, + "SPEAKER_10": tts_voice10, + "SPEAKER_11": tts_voice11, + } + + # Assign 'SPEAKER_00' to segments without a 'speaker' key + for segment in result_diarize["segments"]: + if "speaker" not in segment: + segment["speaker"] = "SPEAKER_00" + logger.warning( + "NO SPEAKER DETECT IN SEGMENT: First TTS will be used in the" + f" segment time {segment['start'], segment['text']}" + ) + # Assign the TTS name + segment["tts_name"] = speaker_to_voice[segment["speaker"]] + + # Find TTS method + pattern_edge = re.compile(r".*-(Male|Female)$") + pattern_bark = re.compile(r".* BARK$") + pattern_vits = re.compile(r".* VITS$") + pattern_coqui = re.compile(r".+\.(wav|mp3|ogg|m4a)$") + pattern_vits_onnx = re.compile(r".* VITS-onnx$") + pattern_openai_tts = re.compile(r".* OpenAI-TTS$") + + all_segments = result_diarize["segments"] + + speakers_edge = find_spkr(pattern_edge, speaker_to_voice, all_segments) + 
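As the patterns defined above show, the suffix of each configured voice string decides which TTS engine handles a speaker. The sketch below condenses that routing into a single first-match lookup; route_voice and ENGINE_PATTERNS are illustrative names, and unlike this sketch the module evaluates every pattern independently when grouping speakers per engine.

import re

ENGINE_PATTERNS = [
    ("edge", re.compile(r".*-(Male|Female)$")),        # e.g. "en-GB-SoniaNeural-Female"
    ("bark", re.compile(r".* BARK$")),
    ("vits", re.compile(r".* VITS$")),                 # e.g. "en-facebook-mms VITS"
    ("vits_onnx", re.compile(r".* VITS-onnx$")),       # Piper models
    ("openai_tts", re.compile(r".* OpenAI-TTS$")),
    ("coqui", re.compile(r".+\.(wav|mp3|ogg|m4a)$")),  # a reference audio file -> XTTS
]

def route_voice(tts_name: str) -> str:
    """Return the backend key for a configured voice string (illustrative only)."""
    for engine, pattern in ENGINE_PATTERNS:
        if pattern.match(tts_name):
            return engine
    raise ValueError(f"No TTS backend matches voice name: {tts_name}")

# route_voice("en-CA-ClaraNeural-Female")      -> "edge"
# route_voice("en_US-lessac-medium VITS-onnx") -> "vits_onnx"
# route_voice("_XTTS_/AUTOMATIC.wav")          -> "coqui"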
speakers_bark = find_spkr(pattern_bark, speaker_to_voice, all_segments) + speakers_vits = find_spkr(pattern_vits, speaker_to_voice, all_segments) + speakers_coqui = find_spkr(pattern_coqui, speaker_to_voice, all_segments) + speakers_vits_onnx = find_spkr( + pattern_vits_onnx, speaker_to_voice, all_segments + ) + speakers_openai_tts = find_spkr( + pattern_openai_tts, speaker_to_voice, all_segments + ) + + # Filter method in segments + filtered_edge = filter_by_speaker(speakers_edge, all_segments) + filtered_bark = filter_by_speaker(speakers_bark, all_segments) + filtered_vits = filter_by_speaker(speakers_vits, all_segments) + filtered_coqui = filter_by_speaker(speakers_coqui, all_segments) + filtered_vits_onnx = filter_by_speaker(speakers_vits_onnx, all_segments) + filtered_openai_tts = filter_by_speaker(speakers_openai_tts, all_segments) + + # Infer + if filtered_edge["segments"]: + logger.info(f"EDGE TTS: {speakers_edge}") + segments_egde_tts(filtered_edge, TRANSLATE_AUDIO_TO, is_gui) # mp3 + if filtered_bark["segments"]: + logger.info(f"BARK TTS: {speakers_bark}") + segments_bark_tts( + filtered_bark, TRANSLATE_AUDIO_TO, model_id_bark + ) # wav + if filtered_vits["segments"]: + logger.info(f"VITS TTS: {speakers_vits}") + segments_vits_tts(filtered_vits, TRANSLATE_AUDIO_TO) # wav + if filtered_coqui["segments"]: + logger.info(f"Coqui TTS: {speakers_coqui}") + segments_coqui_tts( + filtered_coqui, + TRANSLATE_AUDIO_TO, + model_id_coqui, + speakers_coqui, + delete_previous_automatic, + dereverb_automatic, + ) # wav + if filtered_vits_onnx["segments"]: + logger.info(f"PIPER TTS: {speakers_vits_onnx}") + segments_vits_onnx_tts(filtered_vits_onnx, TRANSLATE_AUDIO_TO) # wav + if filtered_openai_tts["segments"]: + logger.info(f"OpenAI TTS: {speakers_openai_tts}") + segments_openai_tts(filtered_openai_tts, TRANSLATE_AUDIO_TO) # wav + + [result.pop("tts_name", None) for result in result_diarize["segments"]] + return [ + speakers_edge, + speakers_bark, + speakers_vits, + speakers_coqui, + speakers_vits_onnx, + speakers_openai_tts + ] + + +def accelerate_segments( + result_diarize, + max_accelerate_audio, + valid_speakers, + acceleration_rate_regulation=False, + folder_output="audio2", +): + logger.info("Apply acceleration") + + ( + speakers_edge, + speakers_bark, + speakers_vits, + speakers_coqui, + speakers_vits_onnx, + speakers_openai_tts + ) = valid_speakers + + create_directories(f"{folder_output}/audio/") + remove_directory_contents(f"{folder_output}/audio/") + + audio_files = [] + speakers_list = [] + + max_count_segments_idx = len(result_diarize["segments"]) - 1 + + for i, segment in tqdm(enumerate(result_diarize["segments"])): + text = segment["text"] # noqa + start = segment["start"] + end = segment["end"] + speaker = segment["speaker"] + + # find name audio + # if speaker in speakers_edge: + filename = f"audio/{start}.ogg" + # elif speaker in speakers_bark + speakers_vits + speakers_coqui + speakers_vits_onnx: + # filename = f"audio/{start}.wav" # wav + + # duration + duration_true = end - start + duration_tts = librosa.get_duration(filename=filename) + + # Accelerate percentage + acc_percentage = duration_tts / duration_true + + # Smoth + if acceleration_rate_regulation and acc_percentage >= 1.3: + try: + next_segment = result_diarize["segments"][ + min(max_count_segments_idx, i + 1) + ] + next_start = next_segment["start"] + next_speaker = next_segment["speaker"] + duration_with_next_start = next_start - start + + if duration_with_next_start > duration_true: + extra_time = 
duration_with_next_start - duration_true + + if speaker == next_speaker: + # half + smoth_duration = duration_true + (extra_time * 0.5) + else: + # 7/10 + smoth_duration = duration_true + (extra_time * 0.7) + logger.debug( + f"Base acc: {acc_percentage}, " + f"smoth acc: {duration_tts / smoth_duration}" + ) + acc_percentage = max(1.2, (duration_tts / smoth_duration)) + + except Exception as error: + logger.error(str(error)) + + if acc_percentage > max_accelerate_audio: + acc_percentage = max_accelerate_audio + elif acc_percentage <= 1.15 and acc_percentage >= 0.8: + acc_percentage = 1.0 + elif acc_percentage <= 0.79: + acc_percentage = 0.8 + + # Round + acc_percentage = round(acc_percentage + 0.0, 1) + + # Format read if need + if speaker in speakers_edge: + info_enc = sf.info(filename).format + else: + info_enc = "OGG" + + # Apply aceleration or opposite to the audio file in folder_output folder + if acc_percentage == 1.0 and info_enc == "OGG": + copy_files(filename, f"{folder_output}{os.sep}audio") + else: + os.system( + f"ffmpeg -y -loglevel panic -i {filename} -filter:a atempo={acc_percentage} {folder_output}/{filename}" + ) + + if logger.isEnabledFor(logging.DEBUG): + duration_create = librosa.get_duration( + filename=f"{folder_output}/{filename}" + ) + logger.debug( + f"acc_percen is {acc_percentage}, tts duration " + f"is {duration_tts}, new duration is {duration_create}" + f", for {filename}" + ) + + audio_files.append(f"{folder_output}/{filename}") + speaker = "TTS Speaker {:02d}".format(int(speaker[-2:]) + 1) + speakers_list.append(speaker) + + return audio_files, speakers_list + + +# ===================================== +# Tone color converter +# ===================================== + + +def se_process_audio_segments( + source_seg, tone_color_converter, device, remove_previous_processed=True +): + # list wav seg + source_audio_segs = glob.glob(f"{source_seg}/*.wav") + if not source_audio_segs: + raise ValueError( + f"No audio segments found in {str(source_audio_segs)}" + ) + + source_se_path = os.path.join(source_seg, "se.pth") + + # if exist not create wav + if os.path.isfile(source_se_path): + se = torch.load(source_se_path).to(device) + logger.debug(f"Previous created {source_se_path}") + else: + se = tone_color_converter.extract_se(source_audio_segs, source_se_path) + + return se + + +def create_wav_vc( + valid_speakers, + segments_base, + audio_name, + max_segments=10, + target_dir="processed", + get_vocals_dereverb=False, +): + # valid_speakers = list({item['speaker'] for item in segments_base}) + + # Before function delete automatic delete_previous_automatic + output_dir = os.path.join(".", target_dir) # remove content + # remove_directory_contents(output_dir) + + path_source_segments = [] + path_target_segments = [] + for speaker in valid_speakers: + filtered_speaker = [ + segment + for segment in segments_base + if segment["speaker"] == speaker + ] + if len(filtered_speaker) > 4: + filtered_speaker = filtered_speaker[1:] + + dir_name_speaker = speaker + audio_name + dir_name_speaker_tts = "tts" + speaker + audio_name + dir_path_speaker = os.path.join(output_dir, dir_name_speaker) + dir_path_speaker_tts = os.path.join(output_dir, dir_name_speaker_tts) + create_directories([dir_path_speaker, dir_path_speaker_tts]) + + path_target_segments.append(dir_path_speaker) + path_source_segments.append(dir_path_speaker_tts) + + # create wav + max_segments_count = 0 + for seg in filtered_speaker: + duration = float(seg["end"]) - float(seg["start"]) + if duration > 3.0 and duration 
< 18.0: + logger.info( + f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {duration}, {seg["text"]}' + ) + name_new_wav = str(seg["start"]) + + check_segment_audio_target_file = os.path.join( + dir_path_speaker, f"{name_new_wav}.wav" + ) + + if os.path.exists(check_segment_audio_target_file): + logger.debug( + "Segment vc source exists: " + f"{check_segment_audio_target_file}" + ) + pass + else: + create_wav_file_vc( + sample_name=name_new_wav, + audio_wav="audio.wav", + start=(float(seg["start"]) + 1.0), + end=(float(seg["end"]) - 1.0), + output_final_path=dir_path_speaker, + get_vocals_dereverb=get_vocals_dereverb, + ) + + file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg" + # copy_files(file_name_tts, os.path.join(output_dir, dir_name_speaker_tts) + convert_to_xtts_good_sample( + file_name_tts, dir_path_speaker_tts + ) + + max_segments_count += 1 + if max_segments_count == max_segments: + break + + if max_segments_count == 0: + logger.info("Taking the first segment") + seg = filtered_speaker[0] + logger.info( + f'Processing segment: {seg["start"]}, {seg["end"]}, {seg["speaker"]}, {seg["text"]}' + ) + max_duration = float(seg["end"]) - float(seg["start"]) + max_duration = max(1.0, min(max_duration, 18.0)) + + name_new_wav = str(seg["start"]) + create_wav_file_vc( + sample_name=name_new_wav, + audio_wav="audio.wav", + start=(float(seg["start"])), + end=(float(seg["start"]) + max_duration), + output_final_path=dir_path_speaker, + get_vocals_dereverb=get_vocals_dereverb, + ) + + file_name_tts = f"audio2/audio/{str(seg['start'])}.ogg" + # copy_files(file_name_tts, os.path.join(output_dir, dir_name_speaker_tts) + convert_to_xtts_good_sample(file_name_tts, dir_path_speaker_tts) + + logger.debug(f"Base: {str(path_source_segments)}") + logger.debug(f"Target: {str(path_target_segments)}") + + return path_source_segments, path_target_segments + + +def toneconverter_openvoice( + result_diarize, + preprocessor_max_segments, + remove_previous_process=True, + get_vocals_dereverb=False, + model="openvoice", +): + audio_path = "audio.wav" + # se_path = "se.pth" + target_dir = "processed" + create_directories(target_dir) + + from openvoice import se_extractor + from openvoice.api import ToneColorConverter + + audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{se_extractor.hash_numpy_array(audio_path)}" + # se_path = os.path.join(target_dir, audio_name, 'se.pth') + + # create wav seg original and target + + valid_speakers = list( + {item["speaker"] for item in result_diarize["segments"]} + ) + + logger.info("Openvoice preprocessor...") + + if remove_previous_process: + remove_directory_contents(target_dir) + + path_source_segments, path_target_segments = create_wav_vc( + valid_speakers, + result_diarize["segments"], + audio_name, + max_segments=preprocessor_max_segments, + get_vocals_dereverb=get_vocals_dereverb, + ) + + logger.info("Openvoice loading model...") + model_path_openvoice = "./OPENVOICE_MODELS" + url_model_openvoice = "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter" + + if "v2" in model: + model_path = os.path.join(model_path_openvoice, "v2") + url_model_openvoice = url_model_openvoice.replace( + "OpenVoice", "OpenVoiceV2" + ).replace("checkpoints/", "") + else: + model_path = os.path.join(model_path_openvoice, "v1") + create_directories(model_path) + + config_url = f"{url_model_openvoice}/config.json" + checkpoint_url = f"{url_model_openvoice}/checkpoint.pth" + + config_path = download_manager(url=config_url, 
path=model_path) + checkpoint_path = download_manager( + url=checkpoint_url, path=model_path + ) + + device = os.environ.get("SONITR_DEVICE") + tone_color_converter = ToneColorConverter(config_path, device=device) + tone_color_converter.load_ckpt(checkpoint_path) + + logger.info("Openvoice tone color converter:") + global_progress_bar = tqdm(total=len(result_diarize["segments"]), desc="Progress") + + for source_seg, target_seg, speaker in zip( + path_source_segments, path_target_segments, valid_speakers + ): + # source_se_path = os.path.join(source_seg, 'se.pth') + source_se = se_process_audio_segments(source_seg, tone_color_converter, device) + # target_se_path = os.path.join(target_seg, 'se.pth') + target_se = se_process_audio_segments(target_seg, tone_color_converter, device) + + # Iterate throw segments + encode_message = "@MyShell" + filtered_speaker = [ + segment + for segment in result_diarize["segments"] + if segment["speaker"] == speaker + ] + for seg in filtered_speaker: + src_path = ( + save_path + ) = f"audio2/audio/{str(seg['start'])}.ogg" # overwrite + logger.debug(f"{src_path}") + + tone_color_converter.convert( + audio_src_path=src_path, + src_se=source_se, + tgt_se=target_se, + output_path=save_path, + message=encode_message, + ) + + global_progress_bar.update(1) + + global_progress_bar.close() + + try: + del tone_color_converter + gc.collect() + torch.cuda.empty_cache() + except Exception as error: + logger.error(str(error)) + gc.collect() + torch.cuda.empty_cache() + + +def toneconverter_freevc( + result_diarize, + remove_previous_process=True, + get_vocals_dereverb=False, +): + audio_path = "audio.wav" + target_dir = "processed" + create_directories(target_dir) + + from openvoice import se_extractor + + audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{se_extractor.hash_numpy_array(audio_path)}" + + # create wav seg; original is target and dubbing is source + valid_speakers = list( + {item["speaker"] for item in result_diarize["segments"]} + ) + + logger.info("FreeVC preprocessor...") + + if remove_previous_process: + remove_directory_contents(target_dir) + + path_source_segments, path_target_segments = create_wav_vc( + valid_speakers, + result_diarize["segments"], + audio_name, + max_segments=1, + get_vocals_dereverb=get_vocals_dereverb, + ) + + logger.info("FreeVC loading model...") + device_id = os.environ.get("SONITR_DEVICE") + device = None if device_id == "cpu" else device_id + try: + from TTS.api import TTS + tts = TTS( + model_name="voice_conversion_models/multilingual/vctk/freevc24", + progress_bar=False + ).to(device) + except Exception as error: + logger.error(str(error)) + logger.error("Error loading the FreeVC model.") + return + + logger.info("FreeVC process:") + global_progress_bar = tqdm(total=len(result_diarize["segments"]), desc="Progress") + + for source_seg, target_seg, speaker in zip( + path_source_segments, path_target_segments, valid_speakers + ): + + filtered_speaker = [ + segment + for segment in result_diarize["segments"] + if segment["speaker"] == speaker + ] + + files_and_directories = os.listdir(target_seg) + wav_files = [file for file in files_and_directories if file.endswith(".wav")] + original_wav_audio_segment = os.path.join(target_seg, wav_files[0]) + + for seg in filtered_speaker: + + src_path = ( + save_path + ) = f"audio2/audio/{str(seg['start'])}.ogg" # overwrite + logger.debug(f"{src_path} - {original_wav_audio_segment}") + + wav = tts.voice_conversion( + source_wav=src_path, + target_wav=original_wav_audio_segment, 
+ ) + + sf.write( + file=save_path, + samplerate=tts.voice_converter.vc_config.audio.output_sample_rate, + data=wav, + format="ogg", + subtype="vorbis", + ) + + global_progress_bar.update(1) + + global_progress_bar.close() + + try: + del tts + gc.collect() + torch.cuda.empty_cache() + except Exception as error: + logger.error(str(error)) + gc.collect() + torch.cuda.empty_cache() + + +def toneconverter( + result_diarize, + preprocessor_max_segments, + remove_previous_process=True, + get_vocals_dereverb=False, + method_vc="freevc" +): + + if method_vc == "freevc": + if preprocessor_max_segments > 1: + logger.info("FreeVC only uses one segment.") + return toneconverter_freevc( + result_diarize, + remove_previous_process=remove_previous_process, + get_vocals_dereverb=get_vocals_dereverb, + ) + elif "openvoice" in method_vc: + return toneconverter_openvoice( + result_diarize, + preprocessor_max_segments, + remove_previous_process=remove_previous_process, + get_vocals_dereverb=get_vocals_dereverb, + model=method_vc, + ) + + +if __name__ == "__main__": + from segments import result_diarize + + audio_segmentation_to_voice( + result_diarize, + TRANSLATE_AUDIO_TO="en", + max_accelerate_audio=2.1, + is_gui=True, + tts_voice00="en-facebook-mms VITS", + tts_voice01="en-CA-ClaraNeural-Female", + tts_voice02="en-GB-ThomasNeural-Male", + tts_voice03="en-GB-SoniaNeural-Female", + tts_voice04="en-NZ-MitchellNeural-Male", + tts_voice05="en-GB-MaisieNeural-Female", + ) diff --git a/soni_translate/translate_segments.py b/soni_translate/translate_segments.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee87db9d96104b937e352360df1ed924ea227a6 --- /dev/null +++ b/soni_translate/translate_segments.py @@ -0,0 +1,457 @@ +from tqdm import tqdm +from deep_translator import GoogleTranslator +from itertools import chain +import copy +from .language_configuration import fix_code_language, INVERTED_LANGUAGES +from .logging_setup import logger +import re +import json +import time + +TRANSLATION_PROCESS_OPTIONS = [ + "google_translator_batch", + "google_translator", + "gpt-3.5-turbo-0125_batch", + "gpt-3.5-turbo-0125", + "gpt-4-turbo-preview_batch", + "gpt-4-turbo-preview", + "disable_translation", +] +DOCS_TRANSLATION_PROCESS_OPTIONS = [ + "google_translator", + "gpt-3.5-turbo-0125", + "gpt-4-turbo-preview", + "disable_translation", +] + + +def translate_iterative(segments, target, source=None): + """ + Translate text segments individually to the specified language. + + Parameters: + - segments (list): A list of dictionaries with 'text' as a key for + segment text. + - target (str): Target language code. + - source (str, optional): Source language code. Defaults to None. + + Returns: + - list: Translated text segments in the target language. + + Notes: + - Translates each segment using Google Translate. 
+ + Example: + segments = [{'text': 'first segment.'}, {'text': 'second segment.'}] + translated_segments = translate_iterative(segments, 'es') + """ + + segments_ = copy.deepcopy(segments) + + if ( + not source + ): + logger.debug("No source language") + source = "auto" + + translator = GoogleTranslator(source=source, target=target) + + for line in tqdm(range(len(segments_))): + text = segments_[line]["text"] + translated_line = translator.translate(text.strip()) + segments_[line]["text"] = translated_line + + return segments_ + + +def verify_translate( + segments, + segments_copy, + translated_lines, + target, + source +): + """ + Verify integrity and translate segments if lengths match, otherwise + switch to iterative translation. + """ + if len(segments) == len(translated_lines): + for line in range(len(segments_copy)): + logger.debug( + f"{segments_copy[line]['text']} >> " + f"{translated_lines[line].strip()}" + ) + segments_copy[line]["text"] = translated_lines[ + line].replace("\t", "").replace("\n", "").strip() + return segments_copy + else: + logger.error( + "The translation failed, switching to google_translate iterative. " + f"{len(segments), len(translated_lines)}" + ) + return translate_iterative(segments, target, source) + + +def translate_batch(segments, target, chunk_size=2000, source=None): + """ + Translate a batch of text segments into the specified language in chunks, + respecting the character limit. + + Parameters: + - segments (list): List of dictionaries with 'text' as a key for segment + text. + - target (str): Target language code. + - chunk_size (int, optional): Maximum character limit for each translation + chunk (default is 2000; max 5000). + - source (str, optional): Source language code. Defaults to None. + + Returns: + - list: Translated text segments in the target language. + + Notes: + - Splits input segments into chunks respecting the character limit for + translation. + - Translates the chunks using Google Translate. + - If chunked translation fails, switches to iterative translation using + `translate_iterative()`. 
+ + Example: + segments = [{'text': 'first segment.'}, {'text': 'second segment.'}] + translated = translate_batch(segments, 'es', chunk_size=4000, source='en') + """ + + segments_copy = copy.deepcopy(segments) + + if ( + not source + ): + logger.debug("No source language") + source = "auto" + + # Get text + text_lines = [] + for line in range(len(segments_copy)): + text = segments_copy[line]["text"].strip() + text_lines.append(text) + + # chunk limit + text_merge = [] + actual_chunk = "" + global_text_list = [] + actual_text_list = [] + for one_line in text_lines: + one_line = " " if not one_line else one_line + if (len(actual_chunk) + len(one_line)) <= chunk_size: + if actual_chunk: + actual_chunk += " ||||| " + actual_chunk += one_line + actual_text_list.append(one_line) + else: + text_merge.append(actual_chunk) + actual_chunk = one_line + global_text_list.append(actual_text_list) + actual_text_list = [one_line] + if actual_chunk: + text_merge.append(actual_chunk) + global_text_list.append(actual_text_list) + + # translate chunks + progress_bar = tqdm(total=len(segments), desc="Translating") + translator = GoogleTranslator(source=source, target=target) + split_list = [] + try: + for text, text_iterable in zip(text_merge, global_text_list): + translated_line = translator.translate(text.strip()) + split_text = translated_line.split("|||||") + if len(split_text) == len(text_iterable): + progress_bar.update(len(split_text)) + else: + logger.debug( + "Chunk fixing iteratively. Len chunk: " + f"{len(split_text)}, expected: {len(text_iterable)}" + ) + split_text = [] + for txt_iter in text_iterable: + translated_txt = translator.translate(txt_iter.strip()) + split_text.append(translated_txt) + progress_bar.update(1) + split_list.append(split_text) + progress_bar.close() + except Exception as error: + progress_bar.close() + logger.error(str(error)) + logger.warning( + "The translation in chunks failed, switching to iterative." 
+ " Related: too many request" + ) # use proxy or less chunk size + return translate_iterative(segments, target, source) + + # un chunk + translated_lines = list(chain.from_iterable(split_list)) + + return verify_translate( + segments, segments_copy, translated_lines, target, source + ) + + +def call_gpt_translate( + client, + model, + system_prompt, + user_prompt, + original_text=None, + batch_lines=None, +): + + # https://platform.openai.com/docs/guides/text-generation/json-mode + response = client.chat.completions.create( + model=model, + response_format={"type": "json_object"}, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ] + ) + result = response.choices[0].message.content + logger.debug(f"Result: {str(result)}") + + try: + translation = json.loads(result) + except Exception as error: + match_result = re.search(r'\{.*?\}', result) + if match_result: + logger.error(str(error)) + json_str = match_result.group(0) + translation = json.loads(json_str) + else: + raise error + + # Get valid data + if batch_lines: + for conversation in translation.values(): + if isinstance(conversation, dict): + conversation = list(conversation.values())[0] + if ( + list( + original_text["conversation"][0].values() + )[0].strip() == + list(conversation[0].values())[0].strip() + ): + continue + if len(conversation) == batch_lines: + break + + fix_conversation_length = [] + for line in conversation: + for speaker_code, text_tr in line.items(): + fix_conversation_length.append({speaker_code: text_tr}) + + logger.debug(f"Data batch: {str(fix_conversation_length)}") + logger.debug( + f"Lines Received: {len(fix_conversation_length)}," + f" expected: {batch_lines}" + ) + + return fix_conversation_length + + else: + if isinstance(translation, dict): + translation = list(translation.values())[0] + if isinstance(translation, list): + translation = translation[0] + if isinstance(translation, set): + translation = list(translation)[0] + if not isinstance(translation, str): + raise ValueError(f"No valid response received: {str(translation)}") + + return translation + + +def gpt_sequential(segments, model, target, source=None): + from openai import OpenAI + + translated_segments = copy.deepcopy(segments) + + client = OpenAI() + progress_bar = tqdm(total=len(segments), desc="Translating") + + lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip() + lang_sc = "" + if source: + lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip() + + fixed_target = fix_code_language(target) + fixed_source = fix_code_language(source) if source else "auto" + + system_prompt = "Machine translation designed to output the translated_text JSON." 
+ + for i, line in enumerate(translated_segments): + text = line["text"].strip() + start = line["start"] + user_prompt = f"Translate the following {lang_sc} text into {lang_tg}, write the fully translated text and nothing more:\n{text}" + + time.sleep(0.5) + + try: + translated_text = call_gpt_translate( + client, + model, + system_prompt, + user_prompt, + ) + + except Exception as error: + logger.error( + f"{str(error)} >> The text of segment {start} " + "is being corrected with Google Translate" + ) + translator = GoogleTranslator( + source=fixed_source, target=fixed_target + ) + translated_text = translator.translate(text.strip()) + + translated_segments[i]["text"] = translated_text.strip() + progress_bar.update(1) + + progress_bar.close() + + return translated_segments + + +def gpt_batch(segments, model, target, token_batch_limit=900, source=None): + from openai import OpenAI + import tiktoken + + token_batch_limit = max(100, (token_batch_limit - 40) // 2) + progress_bar = tqdm(total=len(segments), desc="Translating") + segments_copy = copy.deepcopy(segments) + encoding = tiktoken.get_encoding("cl100k_base") + client = OpenAI() + + lang_tg = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[target]).strip() + lang_sc = "" + if source: + lang_sc = re.sub(r'\([^)]*\)', '', INVERTED_LANGUAGES[source]).strip() + + fixed_target = fix_code_language(target) + fixed_source = fix_code_language(source) if source else "auto" + + name_speaker = "ABCDEFGHIJKL" + + translated_lines = [] + text_data_dict = [] + num_tokens = 0 + count_sk = {char: 0 for char in "ABCDEFGHIJKL"} + + for i, line in enumerate(segments_copy): + text = line["text"] + speaker = line["speaker"] + last_start = line["start"] + # text_data_dict.append({str(int(speaker[-1])+1): text}) + index_sk = int(speaker[-2:]) + character_sk = name_speaker[index_sk] + count_sk[character_sk] += 1 + code_sk = character_sk+str(count_sk[character_sk]) + text_data_dict.append({code_sk: text}) + num_tokens += len(encoding.encode(text)) + 7 + if num_tokens >= token_batch_limit or i == len(segments_copy)-1: + try: + batch_lines = len(text_data_dict) + batch_conversation = {"conversation": copy.deepcopy(text_data_dict)} + # Reset vars + num_tokens = 0 + text_data_dict = [] + count_sk = {char: 0 for char in "ABCDEFGHIJKL"} + # Process translation + # https://arxiv.org/pdf/2309.03409.pdf + system_prompt = f"Machine translation designed to output the translated_conversation key JSON containing a list of {batch_lines} items." + user_prompt = f"Translate each of the following text values in conversation{' from' if lang_sc else ''} {lang_sc} to {lang_tg}:\n{batch_conversation}" + logger.debug(f"Prompt: {str(user_prompt)}") + + conversation = call_gpt_translate( + client, + model, + system_prompt, + user_prompt, + original_text=batch_conversation, + batch_lines=batch_lines, + ) + + if len(conversation) < batch_lines: + raise ValueError( + "Incomplete result received. 
Batch lines: " + f"{len(conversation)}, expected: {batch_lines}" + ) + + for i, translated_text in enumerate(conversation): + if i+1 > batch_lines: + break + translated_lines.append(list(translated_text.values())[0]) + + progress_bar.update(batch_lines) + + except Exception as error: + logger.error(str(error)) + + first_start = segments_copy[max(0, i-(batch_lines-1))]["start"] + logger.warning( + f"The batch from {first_start} to {last_start} " + "failed, is being corrected with Google Translate" + ) + + translator = GoogleTranslator( + source=fixed_source, + target=fixed_target + ) + + for txt_source in batch_conversation["conversation"]: + translated_txt = translator.translate( + list(txt_source.values())[0].strip() + ) + translated_lines.append(translated_txt.strip()) + progress_bar.update(1) + + progress_bar.close() + + return verify_translate( + segments, segments_copy, translated_lines, fixed_target, fixed_source + ) + + +def translate_text( + segments, + target, + translation_process="google_translator_batch", + chunk_size=4500, + source=None, + token_batch_limit=1000, +): + """Translates text segments using a specified process.""" + match translation_process: + case "google_translator_batch": + return translate_batch( + segments, + fix_code_language(target), + chunk_size, + fix_code_language(source) + ) + case "google_translator": + return translate_iterative( + segments, + fix_code_language(target), + fix_code_language(source) + ) + case model if model in ["gpt-3.5-turbo-0125", "gpt-4-turbo-preview"]: + return gpt_sequential(segments, model, target, source) + case model if model in ["gpt-3.5-turbo-0125_batch", "gpt-4-turbo-preview_batch",]: + return gpt_batch( + segments, + translation_process.replace("_batch", ""), + target, + token_batch_limit, + source + ) + case "disable_translation": + return segments + case _: + raise ValueError("No valid translation process") diff --git a/soni_translate/utils.py b/soni_translate/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..88e555671df448b8871dd525203170706e768f2c --- /dev/null +++ b/soni_translate/utils.py @@ -0,0 +1,487 @@ +import os, zipfile, rarfile, shutil, subprocess, shlex, sys # noqa +from .logging_setup import logger +from urllib.parse import urlparse +from IPython.utils import capture +import re + +VIDEO_EXTENSIONS = [ + ".mp4", + ".avi", + ".mov", + ".mkv", + ".wmv", + ".flv", + ".webm", + ".m4v", + ".mpeg", + ".mpg", + ".3gp" +] + +AUDIO_EXTENSIONS = [ + ".mp3", + ".wav", + ".aiff", + ".aif", + ".flac", + ".aac", + ".ogg", + ".wma", + ".m4a", + ".alac", + ".pcm", + ".opus", + ".ape", + ".amr", + ".ac3", + ".vox", + ".caf" +] + +SUBTITLE_EXTENSIONS = [ + ".srt", + ".vtt", + ".ass" +] + + +def run_command(command): + logger.debug(command) + if isinstance(command, str): + command = shlex.split(command) + + sub_params = { + "stdout": subprocess.PIPE, + "stderr": subprocess.PIPE, + "creationflags": subprocess.CREATE_NO_WINDOW + if sys.platform == "win32" + else 0, + } + process_command = subprocess.Popen(command, **sub_params) + output, errors = process_command.communicate() + if ( + process_command.returncode != 0 + ): # or not os.path.exists(mono_path) or os.path.getsize(mono_path) == 0: + logger.error("Error comnand") + raise Exception(errors.decode()) + + +def print_tree_directory(root_dir, indent=""): + if not os.path.exists(root_dir): + logger.error(f"{indent} Invalid directory or file: {root_dir}") + return + + items = os.listdir(root_dir) + + for index, item in enumerate(sorted(items)): + 
item_path = os.path.join(root_dir, item) + is_last_item = index == len(items) - 1 + + if os.path.isfile(item_path) and item_path.endswith(".zip"): + with zipfile.ZipFile(item_path, "r") as zip_file: + print( + f"{indent}{'└──' if is_last_item else '├──'} {item} (zip file)" + ) + zip_contents = zip_file.namelist() + for zip_item in sorted(zip_contents): + print( + f"{indent}{' ' if is_last_item else '│ '}{zip_item}" + ) + else: + print(f"{indent}{'└──' if is_last_item else '├──'} {item}") + + if os.path.isdir(item_path): + new_indent = indent + (" " if is_last_item else "│ ") + print_tree_directory(item_path, new_indent) + + +def upload_model_list(): + weight_root = "weights" + models = [] + for name in os.listdir(weight_root): + if name.endswith(".pth"): + models.append("weights/" + name) + if models: + logger.debug(models) + + index_root = "logs" + index_paths = [None] + for name in os.listdir(index_root): + if name.endswith(".index"): + index_paths.append("logs/" + name) + if index_paths: + logger.debug(index_paths) + + return models, index_paths + + +def manual_download(url, dst): + if "drive.google" in url: + logger.info("Drive url") + if "folders" in url: + logger.info("folder") + os.system(f'gdown --folder "{url}" -O {dst} --fuzzy -c') + else: + logger.info("single") + os.system(f'gdown "{url}" -O {dst} --fuzzy -c') + elif "huggingface" in url: + logger.info("HuggingFace url") + if "/blob/" in url or "/resolve/" in url: + if "/blob/" in url: + url = url.replace("/blob/", "/resolve/") + download_manager(url=url, path=dst, overwrite=True, progress=True) + else: + os.system(f"git clone {url} {dst+'repo/'}") + elif "http" in url: + logger.info("URL") + download_manager(url=url, path=dst, overwrite=True, progress=True) + elif os.path.exists(url): + logger.info("Path") + copy_files(url, dst) + else: + logger.error(f"No valid URL: {url}") + + +def download_list(text_downloads): + + if os.environ.get("ZERO_GPU") == "TRUE": + raise RuntimeError("This option is disabled in this demo.") + + try: + urls = [elem.strip() for elem in text_downloads.split(",")] + except Exception as error: + raise ValueError(f"No valid URL. 
{str(error)}") + + create_directories(["downloads", "logs", "weights"]) + + path_download = "downloads/" + for url in urls: + manual_download(url, path_download) + + # Tree + print("####################################") + print_tree_directory("downloads", indent="") + print("####################################") + + # Place files + select_zip_and_rar_files("downloads/") + + models, _ = upload_model_list() + + # hf space models files delete + remove_directory_contents("downloads/repo") + + return f"Downloaded = {models}" + + +def select_zip_and_rar_files(directory_path="downloads/"): + # filter + zip_files = [] + rar_files = [] + + for file_name in os.listdir(directory_path): + if file_name.endswith(".zip"): + zip_files.append(file_name) + elif file_name.endswith(".rar"): + rar_files.append(file_name) + + # extract + for file_name in zip_files: + file_path = os.path.join(directory_path, file_name) + with zipfile.ZipFile(file_path, "r") as zip_ref: + zip_ref.extractall(directory_path) + + for file_name in rar_files: + file_path = os.path.join(directory_path, file_name) + with rarfile.RarFile(file_path, "r") as rar_ref: + rar_ref.extractall(directory_path) + + # set in path + def move_files_with_extension(src_dir, extension, destination_dir): + for root, _, files in os.walk(src_dir): + for file_name in files: + if file_name.endswith(extension): + source_file = os.path.join(root, file_name) + destination = os.path.join(destination_dir, file_name) + shutil.move(source_file, destination) + + move_files_with_extension(directory_path, ".index", "logs/") + move_files_with_extension(directory_path, ".pth", "weights/") + + return "Download complete" + + +def is_file_with_extensions(string_path, extensions): + return any(string_path.lower().endswith(ext) for ext in extensions) + + +def is_video_file(string_path): + return is_file_with_extensions(string_path, VIDEO_EXTENSIONS) + + +def is_audio_file(string_path): + return is_file_with_extensions(string_path, AUDIO_EXTENSIONS) + + +def is_subtitle_file(string_path): + return is_file_with_extensions(string_path, SUBTITLE_EXTENSIONS) + + +def get_directory_files(directory): + audio_files = [] + video_files = [] + sub_files = [] + + for item in os.listdir(directory): + item_path = os.path.join(directory, item) + + if os.path.isfile(item_path): + + if is_audio_file(item_path): + audio_files.append(item_path) + + elif is_video_file(item_path): + video_files.append(item_path) + + elif is_subtitle_file(item_path): + sub_files.append(item_path) + + logger.info( + f"Files in path ({directory}): " + f"{str(audio_files + video_files + sub_files)}" + ) + + return audio_files, video_files, sub_files + + +def get_valid_files(paths): + valid_paths = [] + for path in paths: + if os.path.isdir(path): + audio_files, video_files, sub_files = get_directory_files(path) + valid_paths.extend(audio_files) + valid_paths.extend(video_files) + valid_paths.extend(sub_files) + else: + valid_paths.append(path) + + return valid_paths + + +def extract_video_links(link): + + params_dlp = {"quiet": False, "no_warnings": True, "noplaylist": False} + + try: + from yt_dlp import YoutubeDL + with capture.capture_output() as cap: + with YoutubeDL(params_dlp) as ydl: + info_dict = ydl.extract_info( # noqa + link, download=False, process=True + ) + + urls = re.findall(r'\[youtube\] Extracting URL: (.*?)\n', cap.stdout) + logger.info(f"List of videos in ({link}): {str(urls)}") + del cap + except Exception as error: + logger.error(f"{link} >> {str(error)}") + urls = [link] + + return urls + + 
+def get_link_list(urls): + valid_links = [] + for url_video in urls: + if "youtube.com" in url_video and "/watch?v=" not in url_video: + url_links = extract_video_links(url_video) + valid_links.extend(url_links) + else: + valid_links.append(url_video) + return valid_links + +# ===================================== +# Download Manager +# ===================================== + + +def load_file_from_url( + url: str, + model_dir: str, + file_name: str | None = None, + overwrite: bool = False, + progress: bool = True, +) -> str: + """Download a file from `url` into `model_dir`, + using the file present if possible. + + Returns the path to the downloaded file. + """ + os.makedirs(model_dir, exist_ok=True) + if not file_name: + parts = urlparse(url) + file_name = os.path.basename(parts.path) + cached_file = os.path.abspath(os.path.join(model_dir, file_name)) + + # Overwrite + if os.path.exists(cached_file): + if overwrite or os.path.getsize(cached_file) == 0: + remove_files(cached_file) + + # Download + if not os.path.exists(cached_file): + logger.info(f'Downloading: "{url}" to {cached_file}\n') + from torch.hub import download_url_to_file + + download_url_to_file(url, cached_file, progress=progress) + else: + logger.debug(cached_file) + + return cached_file + + +def friendly_name(file: str): + if file.startswith("http"): + file = urlparse(file).path + + file = os.path.basename(file) + model_name, extension = os.path.splitext(file) + return model_name, extension + + +def download_manager( + url: str, + path: str, + extension: str = "", + overwrite: bool = False, + progress: bool = True, +): + url = url.strip() + + name, ext = friendly_name(url) + name += ext if not extension else f".{extension}" + + if url.startswith("http"): + filename = load_file_from_url( + url=url, + model_dir=path, + file_name=name, + overwrite=overwrite, + progress=progress, + ) + else: + filename = path + + return filename + + +# ===================================== +# File management +# ===================================== + + +# only remove files +def remove_files(file_list): + if isinstance(file_list, str): + file_list = [file_list] + + for file in file_list: + if os.path.exists(file): + os.remove(file) + + +def remove_directory_contents(directory_path): + """ + Removes all files and subdirectories within a directory. + + Parameters: + directory_path (str): Path to the directory whose + contents need to be removed. + """ + if os.path.exists(directory_path): + for filename in os.listdir(directory_path): + file_path = os.path.join(directory_path, filename) + try: + if os.path.isfile(file_path): + os.remove(file_path) + elif os.path.isdir(file_path): + shutil.rmtree(file_path) + except Exception as e: + logger.error(f"Failed to delete {file_path}. Reason: {e}") + logger.info(f"Content in '{directory_path}' removed.") + else: + logger.error(f"Directory '{directory_path}' does not exist.") + + +# Create directory if not exists +def create_directories(directory_path): + if isinstance(directory_path, str): + directory_path = [directory_path] + for one_dir_path in directory_path: + if not os.path.exists(one_dir_path): + os.makedirs(one_dir_path) + logger.debug(f"Directory '{one_dir_path}' created.") + + +def move_files(source_dir, destination_dir, extension=""): + """ + Moves file(s) from the source path to the destination path. + + Parameters: + source_dir (str): Path to the source directory. + destination_dir (str): Path to the destination directory. + extension (str): Only move files with this extension. 
+ """ + create_directories(destination_dir) + + for filename in os.listdir(source_dir): + source_path = os.path.join(source_dir, filename) + destination_path = os.path.join(destination_dir, filename) + if extension and not filename.endswith(extension): + continue + os.replace(source_path, destination_path) + + +def copy_files(source_path, destination_path): + """ + Copies a file or multiple files from a source path to a destination path. + + Parameters: + source_path (str or list): Path or list of paths to the source + file(s) or directory. + destination_path (str): Path to the destination directory. + """ + create_directories(destination_path) + + if isinstance(source_path, str): + source_path = [source_path] + + if os.path.isdir(source_path[0]): + # Copy all files from the source directory to the destination directory + base_path = source_path[0] + source_path = os.listdir(source_path[0]) + source_path = [ + os.path.join(base_path, file_name) for file_name in source_path + ] + + for one_source_path in source_path: + if os.path.exists(one_source_path): + shutil.copy2(one_source_path, destination_path) + logger.debug( + f"File '{one_source_path}' copied to '{destination_path}'." + ) + else: + logger.error(f"File '{one_source_path}' does not exist.") + + +def rename_file(current_name, new_name): + file_directory = os.path.dirname(current_name) + + if os.path.exists(current_name): + dir_new_name_file = os.path.join(file_directory, new_name) + os.rename(current_name, dir_new_name_file) + logger.debug(f"File '{current_name}' renamed to '{new_name}'.") + return dir_new_name_file + else: + logger.error(f"File '{current_name}' does not exist.") + return None diff --git a/vci_pipeline.py b/vci_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..f67e3eccb75c4f5054187f00c95091990e51ce58 --- /dev/null +++ b/vci_pipeline.py @@ -0,0 +1,454 @@ +import numpy as np, parselmouth, torch, pdb, sys +from time import time as ttime +import torch.nn.functional as F +import scipy.signal as signal +import pyworld, os, traceback, faiss, librosa, torchcrepe +from scipy import signal +from functools import lru_cache +from soni_translate.logging_setup import logger + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1 is the input audio, 2 is the output audio, rate is the proportion of 2 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # one dot every half second + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class VC(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, 
self.x_center, self.x_max, self.is_half = ( + config.x_pad, + config.x_query, + config.x_center, + config.x_max, + config.is_half, + ) + self.sr = 16000 # hubert input sampling rate + self.window = 160 # points per frame + self.t_pad = self.sr * self.x_pad # Pad time before and after each bar + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # Query time before and after the cut point + self.t_center = self.sr * self.x_center # Query point cut position + self.t_max = self.sr * self.x_max # Query-free duration threshold + self.device = config.device + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif "rmvpe" in f0_method: + if hasattr(self, "model_rmvpe") == False: + from lib.rmvpe import RMVPE + + logger.info("Loading vocal pitch estimator model") + self.model_rmvpe = RMVPE( + "rmvpe.pt", is_half=self.is_half, device=self.device + ) + thred = 0.03 + if "+" in f0_method: + f0 = self.model_rmvpe.pitch_based_audio_inference(x, thred, f0_min, f0_max) + else: + f0 = self.model_rmvpe.infer_from_audio(x, thred) + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # f0 points per second + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + try: + f0_coarse = np.rint(f0_mel).astype(np.int) + except: # noqa + f0_coarse = np.rint(f0_mel).astype(int) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + 
index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch != None and pitchf != None: + feats0 = feats.clone() + if ( + isinstance(index, type(None)) == False + and isinstance(big_npy, type(None)) == False + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch != None and pitchf != None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch != None and pitchf != None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch != None and pitchf != None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch != None and pitchf != None: + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + else: + audio1 = ( + (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() + ) + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + # file_big_npy, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) == True + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + logger.warning("File index Not found, set None") + + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += 
audio_pad[i : i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name") == True: + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s : t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt : -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if resample_sr >= 16000 and tgt_sr != resample_sr: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt diff --git a/voice_main.py b/voice_main.py new file mode 100644 index 0000000000000000000000000000000000000000..5e9c1ee8b1c18a438f9d5ead9232753b071a9b2e --- /dev/null +++ b/voice_main.py @@ -0,0 +1,732 @@ +from soni_translate.logging_setup import logger +import torch +import gc +import numpy as np +import os +import shutil +import warnings +import threading +from tqdm import tqdm +from lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + 
SynthesizerTrnMs768NSFsid_nono, +) +from lib.audio import load_audio +import soundfile as sf +import edge_tts +import asyncio +from soni_translate.utils import remove_directory_contents, create_directories +from scipy import signal +from time import time as ttime +import faiss +from vci_pipeline import VC, change_rms, bh, ah +import librosa + +warnings.filterwarnings("ignore") + + +class Config: + def __init__(self, only_cpu=False): + self.device = "cuda:0" + self.is_half = True + self.n_cpu = 0 + self.gpu_name = None + self.gpu_mem = None + ( + self.x_pad, + self.x_query, + self.x_center, + self.x_max + ) = self.device_config(only_cpu) + + def device_config(self, only_cpu) -> tuple: + if torch.cuda.is_available() and not only_cpu: + i_device = int(self.device.split(":")[-1]) + self.gpu_name = torch.cuda.get_device_name(i_device) + if ( + ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) + or "P40" in self.gpu_name.upper() + or "1060" in self.gpu_name + or "1070" in self.gpu_name + or "1080" in self.gpu_name + ): + logger.info( + "16/10 Series GPUs and P40 excel " + "in single-precision tasks." + ) + self.is_half = False + else: + self.gpu_name = None + self.gpu_mem = int( + torch.cuda.get_device_properties(i_device).total_memory + / 1024 + / 1024 + / 1024 + + 0.4 + ) + elif torch.backends.mps.is_available() and not only_cpu: + logger.info("Supported N-card not found, using MPS for inference") + self.device = "mps" + else: + logger.info("No supported N-card found, using CPU for inference") + self.device = "cpu" + self.is_half = False + + if self.n_cpu == 0: + self.n_cpu = os.cpu_count() + + if self.is_half: + # 6GB VRAM configuration + x_pad = 3 + x_query = 10 + x_center = 60 + x_max = 65 + else: + # 5GB VRAM configuration + x_pad = 1 + x_query = 6 + x_center = 38 + x_max = 41 + + if self.gpu_mem is not None and self.gpu_mem <= 4: + x_pad = 1 + x_query = 5 + x_center = 30 + x_max = 32 + + logger.info( + f"Config: Device is {self.device}, " + f"half precision is {self.is_half}" + ) + + return x_pad, x_query, x_center, x_max + + +BASE_DOWNLOAD_LINK = "https://huggingface.co/r3gm/sonitranslate_voice_models/resolve/main/" +BASE_MODELS = [ + "hubert_base.pt", + "rmvpe.pt" +] +BASE_DIR = "." + + +def load_hu_bert(config): + from fairseq import checkpoint_utils + from soni_translate.utils import download_manager + + for id_model in BASE_MODELS: + download_manager( + os.path.join(BASE_DOWNLOAD_LINK, id_model), BASE_DIR + ) + + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + hubert_model.eval() + + return hubert_model + + +def load_trained_model(model_path, config): + + if not model_path: + raise ValueError("No model found") + + logger.info("Loading %s" % model_path) + cpt = torch.load(model_path, map_location="cpu") + tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk + if_f0 = cpt.get("f0", 1) + if if_f0 == 0: + # protect to 0.5 need? 
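+        # f0 == 0 indicates a checkpoint trained without pitch conditioning;
+        # no special handling is applied here, and inference later runs
+        # without pitch/pitchf tensors.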
+ pass + + version = cpt.get("version", "v1") + if version == "v1": + if if_f0 == 1: + net_g = SynthesizerTrnMs256NSFsid( + *cpt["config"], is_half=config.is_half + ) + else: + net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) + elif version == "v2": + if if_f0 == 1: + net_g = SynthesizerTrnMs768NSFsid( + *cpt["config"], is_half=config.is_half + ) + else: + net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) + del net_g.enc_q + + net_g.load_state_dict(cpt["weight"], strict=False) + net_g.eval().to(config.device) + + if config.is_half: + net_g = net_g.half() + else: + net_g = net_g.float() + + vc = VC(tgt_sr, config) + n_spk = cpt["config"][-3] + + return n_spk, tgt_sr, net_g, vc, cpt, version + + +class ClassVoices: + def __init__(self, only_cpu=False): + self.model_config = {} + self.config = None + self.only_cpu = only_cpu + + def apply_conf( + self, + tag="base_model", + file_model="", + pitch_algo="pm", + pitch_lvl=0, + file_index="", + index_influence=0.66, + respiration_median_filtering=3, + envelope_ratio=0.25, + consonant_breath_protection=0.33, + resample_sr=0, + file_pitch_algo="", + ): + + if not file_model: + raise ValueError("Model not found") + + if file_index is None: + file_index = "" + + if file_pitch_algo is None: + file_pitch_algo = "" + + if not self.config: + self.config = Config(self.only_cpu) + self.hu_bert_model = None + self.model_pitch_estimator = None + + self.model_config[tag] = { + "file_model": file_model, + "pitch_algo": pitch_algo, + "pitch_lvl": pitch_lvl, # no decimal + "file_index": file_index, + "index_influence": index_influence, + "respiration_median_filtering": respiration_median_filtering, + "envelope_ratio": envelope_ratio, + "consonant_breath_protection": consonant_breath_protection, + "resample_sr": resample_sr, + "file_pitch_algo": file_pitch_algo, + } + return f"CONFIGURATION APPLIED FOR {tag}: {file_model}" + + def infer( + self, + task_id, + params, + # load model + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version, + if_f0, + # load index + index_rate, + index, + big_npy, + # load f0 file + inp_f0, + # audio file + input_audio_path, + overwrite, + ): + + f0_method = params["pitch_algo"] + f0_up_key = params["pitch_lvl"] + filter_radius = params["respiration_median_filtering"] + resample_sr = params["resample_sr"] + rms_mix_rate = params["envelope_ratio"] + protect = params["consonant_breath_protection"] + + if not os.path.exists(input_audio_path): + raise ValueError( + "The audio file was not found or is not " + f"a valid file: {input_audio_path}" + ) + + f0_up_key = int(f0_up_key) + + audio = load_audio(input_audio_path, 16000) + + # Normalize audio + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + + times = [0, 0, 0] + + # filters audio signal, pads it, computes sliding window sums, + # and extracts optimized time indices + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad( + audio, (pipe.window // 2, pipe.window // 2), mode="reflect" + ) + opt_ts = [] + if audio_pad.shape[0] > pipe.t_max: + audio_sum = np.zeros_like(audio) + for i in range(pipe.window): + audio_sum += audio_pad[i:i - pipe.window] + for t in range(pipe.t_center, audio.shape[0], pipe.t_center): + opt_ts.append( + t + - pipe.t_query + + np.where( + np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]) + == np.abs(audio_sum[t - pipe.t_query: t + pipe.t_query]).min() + )[0][0] + ) + + s = 0 + audio_opt = [] + t = None + t1 = ttime() + + sid_value = 0 + sid = torch.tensor(sid_value, device=pipe.device).unsqueeze(0).long() + + 
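+        # At this point opt_ts holds candidate cut points for long inputs,
+        # chosen within +/- t_query samples of each t_center boundary where
+        # the sliding-window amplitude sum is lowest, so chunks are split in
+        # quiet regions. sid selects the target speaker embedding; index 0
+        # (the first speaker) is hard-coded here.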
# Pads audio symmetrically, calculates length divided by window size. + audio_pad = np.pad(audio, (pipe.t_pad, pipe.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // pipe.window + + # Estimates pitch from audio signal + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = pipe.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if pipe.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor( + pitch, device=pipe.device + ).unsqueeze(0).long() + pitchf = torch.tensor( + pitchf, device=pipe.device + ).unsqueeze(0).float() + + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // pipe.window * pipe.window + if if_f0 == 1: + pitch_slice = pitch[ + :, s // pipe.window: (t + pipe.t_pad2) // pipe.window + ] + pitchf_slice = pitchf[ + :, s // pipe.window: (t + pipe.t_pad2) // pipe.window + ] + else: + pitch_slice = None + pitchf_slice = None + + audio_slice = audio_pad[s:t + pipe.t_pad2 + pipe.window] + audio_opt.append( + pipe.vc( + self.hu_bert_model, + net_g, + sid, + audio_slice, + pitch_slice, + pitchf_slice, + times, + index, + big_npy, + index_rate, + version, + protect, + )[pipe.t_pad_tgt:-pipe.t_pad_tgt] + ) + s = t + + pitch_end_slice = pitch[ + :, t // pipe.window: + ] if t is not None else pitch + pitchf_end_slice = pitchf[ + :, t // pipe.window: + ] if t is not None else pitchf + + audio_opt.append( + pipe.vc( + self.hu_bert_model, + net_g, + sid, + audio_pad[t:], + pitch_end_slice, + pitchf_end_slice, + times, + index, + big_npy, + index_rate, + version, + protect, + )[pipe.t_pad_tgt:-pipe.t_pad_tgt] + ) + + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms( + audio, 16000, audio_opt, tgt_sr, rms_mix_rate + ) + if resample_sr >= 16000 and tgt_sr != resample_sr: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + if tgt_sr != resample_sr >= 16000: + final_sr = resample_sr + else: + final_sr = tgt_sr + + """ + "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( + times[0], + times[1], + times[2], + ), (final_sr, audio_opt) + + """ + + if overwrite: + output_audio_path = input_audio_path # Overwrite + else: + basename = os.path.basename(input_audio_path) + dirname = os.path.dirname(input_audio_path) + + new_basename = basename.split( + '.')[0] + "_edited." 
+ basename.split('.')[-1] + new_path = os.path.join(dirname, new_basename) + logger.info(str(new_path)) + + output_audio_path = new_path + + # Save file + sf.write( + file=output_audio_path, + samplerate=final_sr, + data=audio_opt + ) + + self.model_config[task_id]["result"].append(output_audio_path) + self.output_list.append(output_audio_path) + + def make_test( + self, + tts_text, + tts_voice, + model_path, + index_path, + transpose, + f0_method, + ): + + folder_test = "test" + tag = "test_edge" + tts_file = "test/test.wav" + tts_edited = "test/test_edited.wav" + + create_directories(folder_test) + remove_directory_contents(folder_test) + + if "SET_LIMIT" == os.getenv("DEMO"): + if len(tts_text) > 60: + tts_text = tts_text[:60] + logger.warning("DEMO; limit to 60 characters") + + try: + asyncio.run(edge_tts.Communicate( + tts_text, "-".join(tts_voice.split('-')[:-1]) + ).save(tts_file)) + except Exception as e: + raise ValueError( + "No audio was received. Please change the " + f"tts voice for {tts_voice}. Error: {str(e)}" + ) + + shutil.copy(tts_file, tts_edited) + + self.apply_conf( + tag=tag, + file_model=model_path, + pitch_algo=f0_method, + pitch_lvl=transpose, + file_index=index_path, + index_influence=0.66, + respiration_median_filtering=3, + envelope_ratio=0.25, + consonant_breath_protection=0.33, + ) + + self( + audio_files=tts_edited, + tag_list=tag, + overwrite=True + ) + + return tts_edited, tts_file + + def run_threads(self, threads): + # Start threads + for thread in threads: + thread.start() + + # Wait for all threads to finish + for thread in threads: + thread.join() + + gc.collect() + torch.cuda.empty_cache() + + def unload_models(self): + self.hu_bert_model = None + self.model_pitch_estimator = None + gc.collect() + torch.cuda.empty_cache() + + def __call__( + self, + audio_files=[], + tag_list=[], + overwrite=False, + parallel_workers=1, + ): + logger.info(f"Parallel workers: {str(parallel_workers)}") + + self.output_list = [] + + if not self.model_config: + raise ValueError("No model has been configured for inference") + + if isinstance(audio_files, str): + audio_files = [audio_files] + if isinstance(tag_list, str): + tag_list = [tag_list] + + if not audio_files: + raise ValueError("No audio found to convert") + if not tag_list: + tag_list = [list(self.model_config.keys())[-1]] * len(audio_files) + + if len(audio_files) > len(tag_list): + logger.info("Extend tag list to match audio files") + extend_number = len(audio_files) - len(tag_list) + tag_list.extend([tag_list[0]] * extend_number) + + if len(audio_files) < len(tag_list): + logger.info("Cut list tags") + tag_list = tag_list[:len(audio_files)] + + tag_file_pairs = list(zip(tag_list, audio_files)) + sorted_tag_file = sorted(tag_file_pairs, key=lambda x: x[0]) + + # Base params + if not self.hu_bert_model: + self.hu_bert_model = load_hu_bert(self.config) + + cache_params = None + threads = [] + progress_bar = tqdm(total=len(tag_list), desc="Progress") + for i, (id_tag, input_audio_path) in enumerate(sorted_tag_file): + + if id_tag not in self.model_config.keys(): + logger.info( + f"No configured model for {id_tag} with {input_audio_path}" + ) + continue + + if ( + len(threads) >= parallel_workers + or cache_params != id_tag + and cache_params is not None + ): + + self.run_threads(threads) + progress_bar.update(len(threads)) + + threads = [] + + if cache_params != id_tag: + + self.model_config[id_tag]["result"] = [] + + # Unload previous + ( + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version, + if_f0, + 
index_rate, + index, + big_npy, + inp_f0, + ) = [None] * 11 + gc.collect() + torch.cuda.empty_cache() + + # Model params + params = self.model_config[id_tag] + + model_path = params["file_model"] + f0_method = params["pitch_algo"] + file_index = params["file_index"] + index_rate = params["index_influence"] + f0_file = params["file_pitch_algo"] + + # Load model + ( + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version + ) = load_trained_model(model_path, self.config) + if_f0 = cpt.get("f0", 1) # pitch data + + # Load index + if os.path.exists(file_index) and index_rate != 0: + try: + index = faiss.read_index(file_index) + big_npy = index.reconstruct_n(0, index.ntotal) + except Exception as error: + logger.error(f"Index: {str(error)}") + index_rate = 0 + index = big_npy = None + else: + logger.warning("File index not found") + index_rate = 0 + index = big_npy = None + + # Load f0 file + inp_f0 = None + if os.path.exists(f0_file): + try: + with open(f0_file, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except Exception as error: + logger.error(f"f0 file: {str(error)}") + + if "rmvpe" in f0_method: + if not self.model_pitch_estimator: + from lib.rmvpe import RMVPE + + logger.info("Loading vocal pitch estimator model") + self.model_pitch_estimator = RMVPE( + "rmvpe.pt", + is_half=self.config.is_half, + device=self.config.device + ) + + pipe.model_rmvpe = self.model_pitch_estimator + + cache_params = id_tag + + # self.infer( + # id_tag, + # params, + # # load model + # n_spk, + # tgt_sr, + # net_g, + # pipe, + # cpt, + # version, + # if_f0, + # # load index + # index_rate, + # index, + # big_npy, + # # load f0 file + # inp_f0, + # # output file + # input_audio_path, + # overwrite, + # ) + + thread = threading.Thread( + target=self.infer, + args=( + id_tag, + params, + # loaded model + n_spk, + tgt_sr, + net_g, + pipe, + cpt, + version, + if_f0, + # loaded index + index_rate, + index, + big_npy, + # loaded f0 file + inp_f0, + # audio file + input_audio_path, + overwrite, + ) + ) + + threads.append(thread) + + # Run last + if threads: + self.run_threads(threads) + + progress_bar.update(len(threads)) + progress_bar.close() + + final_result = [] + valid_tags = set(tag_list) + for tag in valid_tags: + if ( + tag in self.model_config.keys() + and "result" in self.model_config[tag].keys() + ): + final_result.extend(self.model_config[tag]["result"]) + + return final_result
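+
+
+if __name__ == "__main__":
+    # Editor's sketch, not part of the original pipeline: a minimal usage
+    # example of ClassVoices under the assumption that an RVC voice model,
+    # its FAISS index, and an input audio file exist at the placeholder
+    # paths below. Only the public methods defined above are used.
+    converter = ClassVoices(only_cpu=True)
+    converter.apply_conf(
+        tag="speaker_00",
+        file_model="weights/example_voice.pth",       # hypothetical path
+        pitch_algo="rmvpe",
+        pitch_lvl=0,
+        file_index="weights/example_voice.index",     # hypothetical path
+        index_influence=0.66,
+        respiration_median_filtering=3,
+        envelope_ratio=0.25,
+        consonant_breath_protection=0.33,
+    )
+    results = converter(
+        audio_files=["audio/SPEAKER_00_segment.wav"],  # hypothetical path
+        tag_list=["speaker_00"],
+        overwrite=False,
+    )
+    logger.info(f"Converted files: {results}")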