import os
from pathlib import Path

from huggingface_hub import HfApi, Repository

from constants import EVAL_REQUESTS_PATH

TOKEN_HUB = os.environ.get("TOKEN_HUB", None)
QUEUE_REPO = os.environ.get("QUEUE_REPO", None)
QUEUE_REPO_MULTI = os.environ.get("QUEUE_REPO_MULTI", None)
QUEUE_REPO_LONGFORM = "Steveeeeeeen/leaderboard_longform"
QUEUE_PATH = os.environ.get("QUEUE_PATH", None)
QUEUE_PATH_MULTI = os.environ.get("QUEUE_PATH_MULTI", None)
QUEUE_PATH_LONGFORM = "Steveeeeeeen/leaderboard_longform"

hf_api = HfApi(
    endpoint="https://huggingface.co",
    token=TOKEN_HUB,
)

def load_all_info_from_dataset_hub():
    """Clone the evaluation queue dataset repo and collect requests and result CSVs."""
    eval_queue_repo = None
    requested_models = None
    csv_results = None

    passed = True
    if TOKEN_HUB is None:
        passed = False
    else:
        print("Pulling evaluation requests and results.")

        eval_queue_repo = Repository(
            local_dir=QUEUE_PATH,
            clone_from=QUEUE_REPO,
            use_auth_token=TOKEN_HUB,
            repo_type="dataset",
        )
        eval_queue_repo.git_pull()

        # Local directory where the dataset repo is cloned + folder with eval requests
        directory = Path(QUEUE_PATH) / EVAL_REQUESTS_PATH
        requested_models = get_all_requested_models(directory)
        requested_models = [p.stem for p in requested_models]

        # Local directory where the dataset repo is cloned
        csv_results = get_csv_with_results(QUEUE_PATH)
        if csv_results is None:
            passed = False

    if not passed:
        raise ValueError(
            "No Hugging Face token provided or no results CSV found. "
            "Skipping evaluation requests and results."
        )

    # Load multilingual and longform data in the same way
    multilingual_csv_results = load_multilingual_data()
    longform_csv_results = load_longform_data()

    return eval_queue_repo, requested_models, csv_results, multilingual_csv_results, longform_csv_results

def load_multilingual_data():
    """Load multilingual evaluation data from CSV."""
    multilingual_queue_path = QUEUE_PATH_MULTI
    try:
        # Try the dedicated multilingual HF repo first
        if TOKEN_HUB is not None:
            print("Pulling multilingual evaluation data.")
            try:
                multilingual_repo = Repository(
                    local_dir=multilingual_queue_path,
                    clone_from=QUEUE_REPO_MULTI,
                    use_auth_token=TOKEN_HUB,
                    repo_type="dataset",
                )
                multilingual_repo.git_pull()
                multilingual_csv = get_csv_with_results(multilingual_queue_path)
            except Exception as e:
                print(f"Failed to pull from multilingual repo: {e}")
                multilingual_csv = None
        else:
            multilingual_csv = None

        # Fall back to a local CSV file
        if multilingual_csv is None:
            print("Using local multilingual CSV file.")
            multilingual_csv = get_csv_with_results(".")

        return multilingual_csv
    except Exception as e:
        print(f"Error loading multilingual data: {e}")
        return None

def load_longform_data():
    """Load longform evaluation data from CSV."""
    longform_queue_path = QUEUE_PATH_LONGFORM
    try:
        # Try the dedicated longform HF repo first
        if TOKEN_HUB is not None:
            print("Pulling longform evaluation data.")
            try:
                longform_repo = Repository(
                    local_dir=longform_queue_path,
                    clone_from=QUEUE_REPO_LONGFORM,
                    use_auth_token=TOKEN_HUB,
                    repo_type="dataset",
                )
                longform_repo.git_pull()
                longform_csv = get_csv_with_results(longform_queue_path)
            except Exception as e:
                print(f"Failed to pull from longform repo: {e}")
                longform_csv = None
        else:
            longform_csv = None

        # Fall back to a local CSV file
        if longform_csv is None:
            print("Using local longform CSV file.")
            longform_csv = get_csv_with_results(".")

        return longform_csv
    except Exception as e:
        print(f"Error loading longform data: {e}")
        return None

def upload_file(requested_model_name, path_or_fileobj):
    dest_repo_file = Path(EVAL_REQUESTS_PATH) / path_or_fileobj.name
    hf_api.upload_file(
        path_or_fileobj=path_or_fileobj,
        path_in_repo=str(dest_repo_file),
        repo_id=QUEUE_REPO,
        token=TOKEN_HUB,
        repo_type="dataset",
        commit_message=f"Add {requested_model_name} to eval queue",
    )

def get_all_requested_models(directory):
    directory = Path(directory)
    all_requested_models = list(directory.glob("*.txt"))
    return all_requested_models

def get_csv_with_results(directory):
    """Return the single CSV in `directory` whose stem ends with "latest", or None."""
    directory = Path(directory)
    all_csv_files = list(directory.glob("*.csv"))
    latest = [f for f in all_csv_files if f.stem.endswith("latest")]
    if len(latest) != 1:
        return None
    return latest[0]

def is_model_on_hub(model_name, revision="main") -> tuple:
    """Return (True, None) if the model exists on the Hub, else (False, error message suffix)."""
    try:
        model_name = model_name.replace(" ", "")
        author = model_name.split("/")[0]
        model_id = model_name.split("/")[1]
        if len(author) == 0 or len(model_id) == 0:
            return False, "is not a valid model name. Please use the format `author/model_name`."
    except Exception:
        return False, "is not a valid model name. Please use the format `author/model_name`."

    try:
        models = list(hf_api.list_models(author=author, search=model_id))
        matched = [m for m in models if m.modelId == model_name]
        if len(matched) != 1:
            return False, "was not found on the hub!"
        else:
            return True, None
    except Exception as e:
        print(f"Could not get the model from the hub: {e}")
        return False, "was not found on the hub!"
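

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how an app entry point (e.g. the Space's app.py) might
# call these helpers. The __main__ guard and the example model id below are
# assumptions for illustration only; they are not defined elsewhere in this file.
if __name__ == "__main__":
    # Pull the queue repo plus the multilingual and longform result CSVs
    # (raises ValueError if TOKEN_HUB is missing or no "*latest.csv" is found).
    eval_queue_repo, requested_models, csv_results, multi_csv, longform_csv = (
        load_all_info_from_dataset_hub()
    )
    print(f"Found {len(requested_models)} requested models; results CSV: {csv_results}")

    # Validate a user-submitted model id before adding it to the eval queue.
    ok, error_suffix = is_model_on_hub("openai/whisper-large-v3")
    if not ok:
        print(f"openai/whisper-large-v3 {error_suffix}")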