Spaces:
Running
Running
| """make_slakh16k_index.py | |
| USAGE: | |
| python tasks/utils/mirdata_dev/scripts/make_slakh16k_index.py '../data' '2100-yourmt3-16k' | |
| """ | |
| import argparse | |
| import glob | |
| import json | |
| import os | |
| import yaml | |
| from mirdata.validate import md5 | |
def get_file_info(path):
    """Return the ``[path, checksum]`` pair used as an index entry.

    Args:
        path: Path of the file to describe.

    Returns:
        list: ``[path, md5(path)]`` when ``path`` exists, otherwise
        ``[None, None]`` (a warning is printed in that case).
    """
    if not os.path.exists(path):
        # Missing files are reported but tolerated; callers check for None.
        warning = "warning: {} not found. check metadata for omitted files.".format(
            path)
        print(warning)
        return [None, None]
    return [path, md5(path)]
def _index_multitrack(mtrack_id, mtrack_path, fmt):
    """Build the index entries for one multitrack folder.

    Args:
        mtrack_id: Name of the multitrack folder (used as key prefix).
        mtrack_path: Path of the multitrack folder, relative to the current
            working directory.
        fmt: Audio file extension without the dot (e.g. ``"wav"``).

    Returns:
        tuple: ``(tracks, multitrack)`` where ``tracks`` maps
        ``"<mtrack_id>-<stem_id>"`` to its audio/midi/metadata entries and
        ``multitrack`` is the entry for the multitrack itself.
    """
    metadata_path = os.path.join(mtrack_path, "metadata.yaml")
    with open(metadata_path, "r") as fhandle:
        metadata = yaml.safe_load(fhandle)
    # Hash the metadata file once; every stem of this multitrack shares it.
    metadata_info = get_file_info(metadata_path)

    tracks = {}
    for stem_id, stem in metadata["stems"].items():
        # Stems flagged as not rendered / without saved MIDI are skipped to
        # avoid missing-audio errors downstream.
        if stem["audio_rendered"] is not True:
            continue
        if stem["midi_saved"] is not True:
            continue

        midi_info = get_file_info(
            os.path.join(mtrack_path, "MIDI", "{}.mid".format(stem_id)))
        # Skip tracks where there is no midi information (and thus no audio).
        if midi_info[0] is None:
            continue

        # Checked (and hashed) exactly once; the result is reused below.
        audio_info = get_file_info(
            os.path.join(mtrack_path, "stems", "{}.{}".format(stem_id, fmt)))
        if audio_info[0] is None:
            continue

        tracks["{}-{}".format(mtrack_id, stem_id)] = {
            "audio": audio_info,
            "midi": midi_info,
            "metadata": list(metadata_info),
        }

    multitrack = {
        "tracks": list(tracks),
        "midi": get_file_info(os.path.join(mtrack_path, "all_src.mid")),
        "mix": get_file_info(os.path.join(mtrack_path, "mix.{}".format(fmt))),
        "metadata": list(metadata_info),
    }
    return tracks, multitrack


def make_dataset_index(dataset_data_path, version):
    """Scan a Slakh dataset folder and write its index as JSON.

    The index is written to
    ``<dataset_data_path>/mirdata_indexes/slakh_index_<version>.json``
    (the directory is created if needed). File paths stored in the index are
    relative to ``dataset_data_path`` because the scan runs with that folder
    as the working directory; the previous working directory is always
    restored, even if indexing fails.

    Args:
        dataset_data_path: Path to the dataset data folder.
        version: One of ``"baby"``, ``"2100-yourmt3-16k"`` or
            ``"2100-redux"``.

    Raises:
        ValueError: If ``version`` is not one of the supported versions.
    """
    # version -> (split sub-folders, top-level folder, audio extension)
    layouts = {
        "baby": ([""], "babyslakh_16k", "wav"),
        "2100-yourmt3-16k": (["train", "validation", "test"],
                             "slakh2100_yourmt3_16k", "wav"),
        "2100-redux": (["train", "validation", "test", "omitted"],
                       "slakh2100_flac_redux", "flac"),
    }
    if version not in layouts:
        raise ValueError("unknown dataset version {!r}; expected one of {}".format(
            version, sorted(layouts)))
    splits, topdir, fmt = layouts[version]

    # Resolve the output location before changing directory so a relative
    # dataset_data_path keeps pointing at the same place, and create the
    # folder now so a failure surfaces before the (slow) hashing starts.
    index_dir = os.path.abspath(
        os.path.join(dataset_data_path, "mirdata_indexes"))
    dataset_index_path = os.path.join(index_dir,
                                      f"slakh_index_{version}.json")
    os.makedirs(index_dir, exist_ok=True)

    multitrack_index = {}
    track_index = {}

    curr_dir = os.getcwd()
    os.chdir(dataset_data_path)
    try:
        for split in splits:
            mtrack_ids = sorted(
                os.path.basename(folder)
                for folder in glob.glob(os.path.join(topdir, split, "Track*")))
            for mtrack_id in mtrack_ids:
                print(f'indexing multitrack: {mtrack_id}')
                tracks, multitrack = _index_multitrack(
                    mtrack_id, os.path.join(topdir, split, mtrack_id), fmt)
                track_index.update(tracks)
                multitrack_index[mtrack_id] = multitrack
    finally:
        # Never leave the process in the dataset folder, even on error.
        os.chdir(curr_dir)

    # top-key level version
    dataset_index = {
        "version": version,
        "tracks": track_index,
        "multitracks": multitrack_index,
    }
    with open(dataset_index_path, "w") as fhandle:
        json.dump(dataset_index, fhandle, indent=2)
def main(args):
    """Build the dataset index for the parsed arguments and report it.

    Args:
        args: Object exposing ``dataset_data_path`` and ``version``
            attributes (the parsed command-line arguments).
    """
    make_dataset_index(args.dataset_data_path, args.version)
    notice = f"A new index file is copied to {args.dataset_data_path}/mirdata_indexes/"
    print(notice)
if __name__ == "__main__":
    # Command-line entry point: two positional string arguments are parsed
    # and handed to main().
    PARSER = argparse.ArgumentParser(description="Make dataset index file.")
    for arg_name, arg_help in (
        ("dataset_data_path", "Path to dataset data folder."),
        ("version", "Dataset version. baby or 2100-redux or 2100-yourmt3-16k"),
    ):
        PARSER.add_argument(arg_name, type=str, help=arg_help)
    main(PARSER.parse_args())