import os
from pathlib import Path
import uuid
import json

from huggingface_hub import HfApi, HfFileSystem

# Dataset repository on the Hugging Face Hub that receives the logs.
DATASET_REPO = "EarthSpeciesProject/naturelm-audio-space-logs"
# Top-level folder inside the repo where audio and metadata are stored.
SPLIT = "test"
# When the TESTING env var is "1", uploaded record ids get a "test-" prefix.
TESTING = os.getenv("TESTING", "0") == "1"

# Client used to upload the audio files.
api = HfApi(token=os.getenv("HF_TOKEN"))
# Filesystem view of the Hub, used to check for and rewrite metadata.jsonl.
hf_fs = HfFileSystem(token=os.getenv("HF_TOKEN"))


def upload_data(audio: str | Path, user_text: str, model_response: str) -> None:
    """Upload one audio file and log its interaction record to the dataset repo.

    The audio is stored at ``{SPLIT}/audio/<id><suffix>`` and a JSON record
    describing the interaction is appended to ``{SPLIT}/metadata.jsonl``.

    Args:
        audio: Local path of the audio file to upload.
        user_text: The user's message, stored under ``"user_message"``.
        model_response: The model's reply, stored under ``"model_response"``.
    """
    data_id = str(uuid.uuid4())
    if TESTING:
        data_id = "test-" + data_id

    # Audio path in repo; the original file extension is kept.
    suffix = Path(audio).suffix
    audio_p = f"{SPLIT}/audio/" + data_id + suffix

    api.upload_file(
        path_or_fileobj=str(audio),
        path_in_repo=audio_p,
        repo_id=DATASET_REPO,
        repo_type="dataset",
    )

    text = {
        "user_message": user_text,
        "model_response": model_response,
        # Has to be relative to metadata.jsonl.
        "file_name": "audio/" + data_id + suffix,
        "original_fn": os.path.basename(audio),
        "id": data_id,
    }
    record_line = json.dumps(text) + "\n"

    metadata_path = f"datasets/{DATASET_REPO}/{SPLIT}/metadata.jsonl"

    # Appending to a file in the repo does not work, so the existing content
    # is read first and the whole file is written back with the new record.
    # NOTE: this read-modify-write is not atomic; calls that overlap in time
    # can overwrite each other's records. Writing one JSON file per record
    # (e.g. f"datasets/{DATASET_REPO}/{data_id}.json") would avoid that.
    if hf_fs.exists(metadata_path):
        with hf_fs.open(metadata_path, "r") as f:
            existing = f.read()
        # Guard against a file whose last record lacks a trailing newline;
        # without this the new record would be fused onto the previous line
        # and the JSONL would no longer parse.
        if existing and not existing.endswith("\n"):
            existing += "\n"
        with hf_fs.open(metadata_path, "w") as f:
            f.write(existing + record_line)
    else:
        with hf_fs.open(metadata_path, "w") as f:
            f.write(record_line)