import gradio as gr
from datasets import load_dataset, Dataset, Audio, concatenate_datasets
import json
import os
from datetime import datetime
import shutil

# Directory to save recordings
AUDIO_DIR = "data/audios"
SAMPLING_RATE = 16000
os.makedirs(AUDIO_DIR, exist_ok=True)

# Global state shared across callbacks
state = {
    "sentences": [],
    "recordings": {},  # Recordings keyed by sentence ID
    "index": 0,        # Index of the sentence currently displayed
    "idx": 0,          # Counter used to generate IDs for manually entered sentences
    "json_loaded": False,
}
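

# Read sentences from the uploaded JSON file and register an empty
# recording list for each sentence ID.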
def load_json(file):
    with open(file.name, "r", encoding="utf-8") as f:
        content = json.load(f)
    state["sentences"].extend(content)
    state["recordings"].update({k["id"]: [] for k in content})
    state["json_loaded"] = True
    return update_display()
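

# Compute the values for the nine UI outputs, in order: sentence text,
# audio input (cleared), ID display, progress text, record button
# visibility, previous/next button visibility, and the audio player's
# value and visibility.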
def update_display():
    if not state["sentences"]:
        return "No data loaded.", None, "", "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
    idx = state["index"]
    progress = ""
    if state["json_loaded"]:
        if idx >= len(state["sentences"]):
            export_json()
            return "✅ All sentences recorded!\n💾 Data exported to JSON", None, "", "", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
        progress = 0
        for recordings in state["recordings"].values():
            if len(recordings) > 0:
                progress += 1
        progress = f"{progress} / {len(state['sentences'])} recorded"
    # Hide the navigation buttons at the ends of the sentence list
    next_btn_enabled = gr.update(visible=not (state["index"] == len(state["sentences"]) - 1))
    prev_btn_enabled = gr.update(visible=not (state["index"] == 0))
    recordings = []
    text = ""
    current_id = f"s_{state['idx']}"
    if idx < len(state["sentences"]):
        current = state["sentences"][idx]
        current_id = current["id"]
        text = current["text"]
        recordings = state["recordings"].get(current["id"], [])
    if recordings:
        # Show the most recent recording for this sentence ID
        current_recording = recordings[-1]
        current_audio = current_recording["audio"]
        audio_visibility = gr.update(visible=True)
    else:
        current_audio = None
        audio_visibility = gr.update(visible=False)
    return text, None, f"ID: {current_id}", progress, gr.update(visible=True), prev_btn_enabled, next_btn_enabled, current_audio, audio_visibility
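

# Save the submitted audio under the current sentence ID; repeated takes
# for the same sentence get a versioned ID suffix (e.g. "001_v1").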
def record_audio(audio, text):
    if state["sentences"] and state["index"] >= len(state["sentences"]):
        return update_display()
    if audio is None:
        gr.Warning("The audio is empty, please provide a valid recording")
        return update_display()
    if state["json_loaded"]:
        state["sentences"][state["index"]]["text"] = text  # Overwrite with the currently displayed text
    else:
        state["sentences"].append({"id": f"s_{state['idx']}", "text": text})
        state["idx"] += 1
    sentence = state["sentences"][state["index"]]
    uid = sentence["id"]
    filename = f"{uid}_{datetime.now().strftime('%Y%m%d%H%M%S')}.wav"
    filepath = os.path.join(AUDIO_DIR, filename)
    shutil.copy(audio, filepath)
    # Add the new recording under the correct ID in the recordings dictionary
    uid_versioning = uid
    recordings = state["recordings"].get(uid, [])
    if recordings:
        uid_versioning = f"{uid}_v{len(recordings)}"
    state["recordings"].setdefault(uid, []).append({
        "id": uid_versioning,
        "text": sentence["text"],
        "audio": filepath,
    })
    state["index"] += 1
    return update_display()
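

# Flatten every recorded version into a single list and write it to disk.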
def export_json():
    output_path = "data/tts_dataset.json"
    data = [record for records in state["recordings"].values() for record in records]
    if data:
        with open(output_path, "w") as f:
            json.dump(data, f, indent=2)
    else:
        gr.Warning("There is no recorded data")
    return output_path
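

# Navigate between sentences without recording.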
def go_previous():
    if state["index"] > 0:
        state["index"] -= 1
    return update_display()


def go_next():
    if state["index"] < len(state["sentences"]) - 1:
        state["index"] += 1
    return update_display()
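

# Push all recordings to the Hugging Face Hub as an audio dataset.
# Unless "New dataset" is checked, the existing train split is loaded
# first and the new recordings are appended to it.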
def push_to_hub(hub_id, is_new_dataset, sampling_rate):
    if hub_id:
        # Flatten the recordings into one row per recorded version
        recordings = []
        for element in state["recordings"].values():
            for version in element:
                recordings.append({"id": version["id"], "audio": version["audio"], "text": version["text"]})
        dataset = Dataset.from_list(recordings)
        dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
        if not is_new_dataset:
            previous_dataset = load_dataset(hub_id, split="train")
            dataset = concatenate_datasets([previous_dataset, dataset])
        dataset.push_to_hub(hub_id)
        gr.Info("Successfully synced with the Hub")
    else:
        gr.Warning("The hub_id field is empty, please provide a valid hub id.")
    return update_display()
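

# --- UI layout and event wiring ---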
with gr.Blocks() as demo:
    gr.Markdown("""# 🗣️ TTS Dataset Recorder

Welcome to the **TTS Dataset Recorder**! This tool helps you quickly create a high-quality dataset for Text-to-Speech (TTS) models. Whether you're starting from scratch or have a pre-existing set of text data, this app lets you record audio samples and export them with the corresponding metadata.

### **How to Use?**

1. **Upload a JSON File** containing the sentences you'd like to record (or manually input them through the app).
2. **Record Audio** for each sentence. The app will automatically associate your recordings with the correct text.
3. **Export the Dataset** as a JSON file or **Sync** to HuggingFace for easy sharing and use.

### **Data Input Format**

Your JSON file should follow this structure:

```json
[
  { "id": "001", "text": "Hello, how are you?" },
  { "id": "002", "text": "This is a sample sentence." }
]
```
""")

    with gr.Row():
        json_file = gr.File(label="Upload Sentences JSON", file_types=[".json"])
        with gr.Column():
            export_btn = gr.Button("💾 Export Metadata")
            with gr.Row():
                hub_id = gr.Textbox(label="Hub id", interactive=True)
            with gr.Row():
                is_new_dataset = gr.Checkbox(label="New dataset", interactive=True)
                sampling_rate = gr.Number(label="Sampling rate", value=SAMPLING_RATE, precision=0)
            push_to_hub_btn = gr.Button("🤗 Sync to HuggingFace")

    id_display = gr.Textbox(label="ID", interactive=False)
    progress_text = gr.Textbox(label="Progress", interactive=False)
    sentence_text = gr.Textbox(label="Sentence", interactive=True)
    audio_input = gr.Audio(type="filepath", label="Record your voice", interactive=True)
    record_btn = gr.Button("✅ Submit Recording")

    with gr.Row():
        prev_btn = gr.Button("⬅️ Previous")
        next_btn = gr.Button("➡️ Next")

    audio_player = gr.Audio(label="Play Recorded Audio", type="filepath")
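
    # All callbacks except export_json return the nine values from
    # update_display, so they share the same outputs list (audio_player
    # appears twice: once for its value, once for its visibility).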
    json_file.change(load_json, inputs=json_file, outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    record_btn.click(record_audio, inputs=[audio_input, sentence_text], outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    export_btn.click(export_json, outputs=gr.File())
    prev_btn.click(go_previous, outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    next_btn.click(go_next, outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])
    push_to_hub_btn.click(push_to_hub, inputs=[hub_id, is_new_dataset, sampling_rate], outputs=[sentence_text, audio_input, id_display, progress_text, record_btn, prev_btn, next_btn, audio_player, audio_player])

demo.launch()