| """Arabic Speech Corpus""" | |
| from __future__ import absolute_import, division, print_function | |
| import os | |
| import datasets | |

_CITATION = """
"""

_DESCRIPTION = """\
Egyptian Arabic speech corpus (MGB-3). Each example carries the path to a WAV
recording ("file"), its transcript ("text"), and the start/stop frame offsets
of the transcribed segment within that recording ("segment", encoded as
"<start>_<stop>"). To decode the audio, map over the dataset with soundfile:

```python
import soundfile as sf


def map_to_array(batch):
    start, stop = (int(x) for x in batch["segment"].split("_"))
    speech_array, _ = sf.read(batch["file"], start=start, stop=stop)
    batch["speech"] = speech_array
    return batch


dataset = dataset.map(map_to_array, remove_columns=["file"])
```
"""

# Local archive expected alongside this loading script; resolved and
# extracted in `_split_generators`.
_URL = "mgb3.zip"

# WAV files known to be corrupt; skipped during example generation.
CORRUPT_FILES = {
    "familyKids_02_first_12min.wav",
    "sports_04_first_12min.wav",
    "cooking_05_first_12min.wav",
    "moviesDrama_07_first_12min.wav",
    "science_06_first_12min.wav",
    "comedy_09_first_12min.wav",
    "cultural_08_first_12min.wav",
    "familyKids_11_first_12min.wav",
    "science_10_first_12min.wav",
}


class EgyptianSpeechCorpusConfig(datasets.BuilderConfig):
    """BuilderConfig for EgyptianSpeechCorpus."""

    def __init__(self, **kwargs):
        """
        Args:
            **kwargs: keyword arguments forwarded to super, e.g. `name` and
                `description`.
        """
        super(EgyptianSpeechCorpusConfig, self).__init__(version=datasets.Version("2.1.0", ""), **kwargs)


def map_to_array(batch):
    # "segment" encodes the start/stop frame offsets as "<start>_<stop>";
    # soundfile expects integer frame indices, so convert before reading.
    start, stop = (int(x) for x in batch["segment"].split("_"))
    speech_array, _ = sf.read(batch["file"], start=start, stop=stop)
    batch["speech"] = speech_array
    return batch
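
# Intended use (a sketch): decoding inside `.map` reads each "first_12min"
# recording slice-by-slice rather than loading it whole, e.g. after loading
# the dataset (see the example at the end of this file):
#
#   dataset = dataset.map(map_to_array, remove_columns=["file"])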


class EgyptianSpeechCorpus(datasets.GeneratorBasedBuilder):
    """EgyptianSpeechCorpus dataset."""

    BUILDER_CONFIGS = [
        EgyptianSpeechCorpusConfig(name="clean", description="'Clean' speech."),
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "file": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "segment": datasets.Value("string"),
                }
            ),
            supervised_keys=("file", "text"),
            homepage=_URL,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        # Assumes the archive extracts to a root containing the `adapt`,
        # `dev`, `test`, and `wav` directories used below.
        self.archive_path = dl_manager.download_and_extract(_URL)
        return [
            datasets.SplitGenerator(name="train", gen_kwargs={"archive_path": os.path.join(self.archive_path, "adapt")}),
            datasets.SplitGenerator(name="dev", gen_kwargs={"archive_path": os.path.join(self.archive_path, "dev")}),
            datasets.SplitGenerator(name="test", gen_kwargs={"archive_path": os.path.join(self.archive_path, "test")}),
        ]
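
    # Expected on-disk layout, inferred from the paths used below:
    #
    #   <archive root>/
    #       wav/                        # full recordings, shared by all splits
    #       adapt/Alaa/text_noverlap    # train: "<segment-id> <transcript>" per line
    #       dev/Alaa/text_noverlap
    #       test/Alaa/text_noverlap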
    def _generate_examples(self, archive_path):
        """Generate examples from an MGB-3 split directory."""
        text_dir = os.path.join(archive_path, "Alaa")
        # WAV files sit at the archive root and are shared across splits.
        wav_dir = os.path.join(self.archive_path, "wav")
        segments_file = os.path.join(text_dir, "text_noverlap")
        available_wavs = set(os.listdir(wav_dir))
        with open(segments_file, "r", encoding="utf-8") as f:
            for _id, line in enumerate(f):
                # "<segment-id> <transcript>": the id is the wav stem (four
                # underscore-separated tokens) followed by "<start>_<stop>".
                segment, _, text = line.strip().partition(" ")
                wav_file = "_".join(segment.split("_")[:4]) + ".wav"
                start, stop = segment.split("_")[4:6]
                if wav_file in CORRUPT_FILES or wav_file not in available_wavs:
                    continue
                yield str(_id), {
                    "file": os.path.join(wav_dir, wav_file),
                    "text": text,
                    "segment": "_".join([start, stop]),
                }
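

# A minimal smoke test, assuming `mgb3.zip` sits next to this script and a
# `datasets` version that still supports script-based loading; the "clean"
# config name comes from BUILDER_CONFIGS above.
if __name__ == "__main__":
    ds = datasets.load_dataset(__file__, "clean", split="train")
    ds = ds.map(map_to_array, remove_columns=["file"])
    print(ds[0]["text"], len(ds[0]["speech"]))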

