Spaces:
Runtime error
Runtime error
| import requests | |
| import duckdb | |
# Base URL of the dataset viewer API. Note that it ends with a slash, so
# strip that slash before appending an endpoint path that starts with "/".
DATASET_VIEWER_API_URL = "https://datasets-server.huggingface.co/"
# Module-level requests session; fetch_json() sends every request through it.
session = requests.Session()
def fetch_json(url, params=None, timeout=20):
    """Issue a GET request through the shared session and return its JSON body.

    Args:
        url: Endpoint to query.
        params: Optional query parameters, forwarded to ``session.get``.
        timeout: Timeout value forwarded to ``session.get`` (default 20).

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the decoded body contains an ``"error"`` entry.
        Any error raised by ``response.raise_for_status()`` is propagated
        unchanged.
    """
    resp = session.get(url, params=params, timeout=timeout)
    # Surface HTTP-level failures before attempting to decode the body.
    resp.raise_for_status()
    payload = resp.json()
    if "error" not in payload:
        return payload
    # A response with a successful status can still report an error in its body.
    raise Exception(f"Error fetching data: {payload['error']}")
def get_split_rows(dataset, config, split):
    """Return the row count of one split as reported by the ``/size`` endpoint.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Name of the split to look up in the response.

    Returns:
        The ``num_rows`` value of the entry in ``size.splits`` whose
        ``split`` field equals *split*.

    Raises:
        Exception: If no entry for *split* is present in the response, or if
            ``fetch_json`` reports an error.
    """
    # Drop any trailing slash from the base URL so the request path is
    # "/size" rather than "//size".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/size"
    params = {"dataset": dataset, "config": config}
    config_size = fetch_json(url, params)
    for split_size in config_size["size"]["splits"]:
        if split_size["split"] == split:
            return split_size["num_rows"]
    raise Exception(f"Error fetching split {split} in config {config}")
def get_parquet_urls(dataset, config, split):
    """Return the parquet file URLs of a split as one SQL-ready string.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.
        config: Configuration name sent as the ``config`` query parameter.
        split: Split name sent as the ``split`` query parameter.

    Returns:
        A single string of comma-separated, single-quoted URLs taken from
        the ``parquet_files`` entries of the ``/parquet`` response, e.g.
        ``"'https://a','https://b'"``. An empty string is returned when the
        response lists no files.

    Raises:
        Exception: If ``fetch_json`` reports an error.
    """
    # Drop any trailing slash from the base URL so the request path is
    # "/parquet" rather than "//parquet".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/parquet"
    params = {"dataset": dataset, "config": config, "split": split}
    parquet_files = fetch_json(url, params)
    # Each URL becomes a SQL string literal; double any embedded single
    # quote so a quote inside a URL cannot terminate the literal early.
    return ",".join(
        "'{}'".format(file["url"].replace("'", "''"))
        for file in parquet_files["parquet_files"]
    )
def get_docs_from_parquet(parquet_urls, column, offset, limit):
    """Read a window of values from one column of the given parquet files.

    Args:
        parquet_urls: Comma-separated, single-quoted URL list (the format
            produced by ``get_parquet_urls``). It is spliced verbatim into
            ``read_parquet([...])``.
        column: Name of the column to select.
        offset: Number of leading rows to skip; must be convertible to int.
        limit: Maximum number of rows to return; must be convertible to int.

    Returns:
        A list with the values of *column* for the selected rows.

    Raises:
        ValueError, TypeError: If *offset* or *limit* cannot be converted
            to an integer.
    """
    # Quote the identifier (doubling embedded double quotes) so column names
    # containing spaces, punctuation or reserved words are valid, and so the
    # name cannot inject additional SQL into the statement.
    quoted_column = '"' + str(column).replace('"', '""') + '"'
    # Coerce to int so only plain integers ever reach the SQL text.
    limit = int(limit)
    offset = int(offset)
    sql_query = (
        f"SELECT {quoted_column} FROM read_parquet([{parquet_urls}]) "
        f"LIMIT {limit} OFFSET {offset};"
    )
    df = duckdb.sql(sql_query).to_df()
    return df[column].tolist()
def get_info(dataset):
    """Return the ``dataset_info`` section of the ``/info`` endpoint response.

    Args:
        dataset: Dataset identifier sent as the ``dataset`` query parameter.

    Returns:
        The value stored under the ``"dataset_info"`` key of the response.

    Raises:
        Exception: If ``fetch_json`` reports an error.
        KeyError: If the response has no ``"dataset_info"`` key.
    """
    # Drop any trailing slash from the base URL so the request path is
    # "/info" rather than "//info".
    url = f"{DATASET_VIEWER_API_URL.rstrip('/')}/info"
    info_resp = fetch_json(url, {"dataset": dataset})
    return info_resp["dataset_info"]