Commit
·
9d24b08
1
Parent(s):
c4b6820
Refactor dataset migration tool for GitHub and Kaggle datasets
Browse files
app.py
CHANGED
|
@@ -2,13 +2,15 @@ import contextlib
|
|
| 2 |
import re
|
| 3 |
import tempfile
|
| 4 |
from functools import lru_cache
|
|
|
|
| 5 |
|
| 6 |
import gradio as gr
|
| 7 |
from git import Repo
|
| 8 |
from httpx import Client
|
| 9 |
-
from typing import Optional
|
| 10 |
from huggingface_hub import create_repo, upload_folder
|
| 11 |
from toolz import groupby
|
|
|
|
|
|
|
| 12 |
|
| 13 |
client = Client()
|
| 14 |
|
|
@@ -46,7 +48,7 @@ def upload_directory_to_hf(
|
|
| 46 |
commit_message="Migrated from GitHub",
|
| 47 |
ignore_patterns=[
|
| 48 |
"*.git*",
|
| 49 |
-
|
| 50 |
"*.DS_Store",
|
| 51 |
"*.env",
|
| 52 |
], # ignore git files and .env files
|
|
@@ -132,6 +134,34 @@ def show_files_and_directories(url: str):
|
|
| 132 |
)
|
| 133 |
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
html_text_app_description = """
|
| 136 |
Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
|
| 137 |
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
|
|
@@ -148,71 +178,101 @@ This app will help you migrate a dataset currently hosted on GitHub to the Huggi
|
|
| 148 |
|
| 149 |
with gr.Blocks(theme=gr.themes.Base()) as demo:
|
| 150 |
gr.HTML(
|
| 151 |
-
"""<h1 style='text-align: center;'>
|
| 152 |
-
<center><i> ✨ Migrate
|
| 153 |
-
)
|
| 154 |
-
gr.HTML(
|
| 155 |
-
"""<center> GitHub is a great place for sharing code but the Hugging Face Hub has many advantages for sharing datasets.
|
| 156 |
-
<br> This Space will guide you through the process of migrating a dataset from GitHub to the Hugging Face Hub. </center>"""
|
| 157 |
)
|
|
|
|
| 158 |
with gr.Row():
|
| 159 |
gr.LoginButton(size="sm")
|
| 160 |
-
|
| 161 |
-
gr.
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
gr.Markdown(
|
| 213 |
"""You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
|
| 214 |
If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
|
| 215 |
)
|
| 216 |
|
| 217 |
-
|
| 218 |
demo.launch()
|
|
|
|
| 2 |
import re
|
| 3 |
import tempfile
|
| 4 |
from functools import lru_cache
|
| 5 |
+
from typing import Optional
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
from git import Repo
|
| 9 |
from httpx import Client
|
|
|
|
| 10 |
from huggingface_hub import create_repo, upload_folder
|
| 11 |
from toolz import groupby
|
| 12 |
+
import kagglehub
|
| 13 |
+
from kagglehub import KaggleDatasetAdapter
|
| 14 |
|
| 15 |
client = Client()
|
| 16 |
|
|
|
|
| 48 |
commit_message="Migrated from GitHub",
|
| 49 |
ignore_patterns=[
|
| 50 |
"*.git*",
|
| 51 |
+
# "*README.md*",
|
| 52 |
"*.DS_Store",
|
| 53 |
"*.env",
|
| 54 |
], # ignore git files and .env files
|
|
|
|
| 134 |
)
|
| 135 |
|
| 136 |
|
| 137 |
+
def push_kaggle_to_hf(
|
| 138 |
+
source_kaggle_dataset: str,
|
| 139 |
+
destination_hf_hub_repository: str,
|
| 140 |
+
file_path: str,
|
| 141 |
+
oauth_token: gr.OAuthToken,
|
| 142 |
+
):
|
| 143 |
+
"""Pushes a Kaggle dataset to HuggingFace Hub using the HF dataset adapter"""
|
| 144 |
+
if not file_path:
|
| 145 |
+
raise ValueError("File path must be specified for Kaggle datasets")
|
| 146 |
+
|
| 147 |
+
gr.Info("Loading Kaggle dataset...")
|
| 148 |
+
dataset = kagglehub.load_dataset(
|
| 149 |
+
KaggleDatasetAdapter.HUGGING_FACE,
|
| 150 |
+
source_kaggle_dataset,
|
| 151 |
+
file_path,
|
| 152 |
+
)
|
| 153 |
+
gr.Info("Loading Kaggle dataset...Done")
|
| 154 |
+
|
| 155 |
+
gr.Info("Pushing to Hugging Face Hub...")
|
| 156 |
+
dataset.push_to_hub(
|
| 157 |
+
destination_hf_hub_repository,
|
| 158 |
+
token=oauth_token.token,
|
| 159 |
+
)
|
| 160 |
+
gr.Info("Pushing to Hugging Face Hub...Done")
|
| 161 |
+
|
| 162 |
+
return f"Pushed the dataset to [{destination_hf_hub_repository}](https://huggingface.co/datasets/{destination_hf_hub_repository})"
|
| 163 |
+
|
| 164 |
+
|
| 165 |
html_text_app_description = """
|
| 166 |
Whilst GitHub is great for hosting code the Hugging Face Datasets Hub is a better place to host datasets.
|
| 167 |
Some of the benefits of hosting datasets on the Hugging Face Datasets Hub are:
|
|
|
|
| 178 |
|
| 179 |
with gr.Blocks(theme=gr.themes.Base()) as demo:
|
| 180 |
gr.HTML(
|
| 181 |
+
"""<h1 style='text-align: center;'> Dataset Migration Tool</h1>
|
| 182 |
+
<center><i> ✨ Migrate datasets to Hugging Face Hub in a few steps ✨</i></center>"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
)
|
| 184 |
+
|
| 185 |
with gr.Row():
|
| 186 |
gr.LoginButton(size="sm")
|
| 187 |
+
|
| 188 |
+
with gr.Tabs() as tabs:
|
| 189 |
+
with gr.Tab("GitHub"):
|
| 190 |
+
gr.Markdown("### Location of existing dataset")
|
| 191 |
+
gr.Markdown(
|
| 192 |
+
"URL for the GitHub repository where the dataset is currently hosted"
|
| 193 |
+
)
|
| 194 |
+
source_github_repository = gr.Textbox(
|
| 195 |
+
lines=1, label="Source GitHub Repository URL"
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
with gr.Accordion("Advanced Options", open=False):
|
| 199 |
+
gr.Markdown("### Select files and folder to migrate")
|
| 200 |
+
gr.Markdown(
|
| 201 |
+
"(Optional): select a specific folder and/or files to migrate from the GitHub repository. If you select a folder all the files in that folder will be migrated."
|
| 202 |
+
)
|
| 203 |
+
folder_in_github_repo = gr.Dropdown(
|
| 204 |
+
None,
|
| 205 |
+
label="Folder in the GitHub Repository to migrate",
|
| 206 |
+
allow_custom_value=True,
|
| 207 |
+
visible=True,
|
| 208 |
+
)
|
| 209 |
+
files_in_github_repo = gr.Dropdown(
|
| 210 |
+
None,
|
| 211 |
+
label="Files in GitHub Repository to migrate",
|
| 212 |
+
allow_custom_value=True,
|
| 213 |
+
visible=True,
|
| 214 |
+
)
|
| 215 |
+
source_github_repository.change(
|
| 216 |
+
show_files_and_directories,
|
| 217 |
+
[source_github_repository],
|
| 218 |
+
[folder_in_github_repo, files_in_github_repo],
|
| 219 |
+
)
|
| 220 |
+
|
| 221 |
+
gr.Markdown("### Destination for your migrated dataset")
|
| 222 |
+
destination_hf_hub_repository = gr.Textbox(
|
| 223 |
+
label="Destination Hugging Face Repository",
|
| 224 |
+
placeholder="i.e. <hugging face username>/<repository_name>",
|
| 225 |
+
)
|
| 226 |
+
|
| 227 |
+
github_submit_btn = gr.Button("Migrate GitHub Dataset")
|
| 228 |
+
github_result = gr.Markdown(label="Summary", visible=True)
|
| 229 |
+
|
| 230 |
+
github_submit_btn.click(
|
| 231 |
+
push_to_hf,
|
| 232 |
+
[
|
| 233 |
+
source_github_repository,
|
| 234 |
+
destination_hf_hub_repository,
|
| 235 |
+
folder_in_github_repo,
|
| 236 |
+
],
|
| 237 |
+
[github_result],
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
+
with gr.Tab("Kaggle"):
|
| 241 |
+
gr.Markdown("### Source Kaggle Dataset")
|
| 242 |
+
gr.Markdown("Enter the Kaggle dataset name and file path")
|
| 243 |
+
source_kaggle_dataset = gr.Textbox(
|
| 244 |
+
lines=1,
|
| 245 |
+
label="Source Kaggle Dataset",
|
| 246 |
+
placeholder="username/dataset-name",
|
| 247 |
+
)
|
| 248 |
+
kaggle_file_path = gr.Textbox(
|
| 249 |
+
label="File path in dataset",
|
| 250 |
+
placeholder="e.g., train.csv",
|
| 251 |
+
info="Specify the file to migrate from the dataset",
|
| 252 |
+
)
|
| 253 |
+
|
| 254 |
+
gr.Markdown("### Destination for your migrated dataset")
|
| 255 |
+
kaggle_destination_hf_hub = gr.Textbox(
|
| 256 |
+
label="Destination Hugging Face Repository",
|
| 257 |
+
placeholder="i.e. <hugging face username>/<repository_name>",
|
| 258 |
+
)
|
| 259 |
+
|
| 260 |
+
kaggle_submit_btn = gr.Button("Migrate Kaggle Dataset")
|
| 261 |
+
kaggle_result = gr.Markdown(label="Summary", visible=True)
|
| 262 |
+
|
| 263 |
+
kaggle_submit_btn.click(
|
| 264 |
+
push_kaggle_to_hf,
|
| 265 |
+
[
|
| 266 |
+
source_kaggle_dataset,
|
| 267 |
+
kaggle_destination_hf_hub,
|
| 268 |
+
kaggle_file_path,
|
| 269 |
+
],
|
| 270 |
+
[kaggle_result],
|
| 271 |
+
)
|
| 272 |
+
|
| 273 |
gr.Markdown(
|
| 274 |
"""You should add a dataset card for your dataset to help people discover and understand your dataset. You can find instructions for creating a dataset card [here](https://huggingface.co/docs/datasets/dataset_card).
|
| 275 |
If you have any questions or feedback feel free to reach out to us on using the [Discussion tab](https://huggingface.co/spaces/librarian-bots/github-to-huggingface-dataset-migration-tool/discussions/1)"""
|
| 276 |
)
|
| 277 |
|
|
|
|
| 278 |
demo.launch()
|