Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import json | |
| from io import BytesIO | |
| import requests | |
| import re | |
| from openpyxl import Workbook | |
# Control characters that Excel/openpyxl reject in cell values.
# Tab (\x09), newline (\x0A) and carriage return (\x0D) are deliberately kept.
_ILLEGAL_XLSX_CHARS = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F]')

def sanitize_value(val):
    """
    Coerce *val* into something openpyxl will accept in a worksheet cell.

    bytes are decoded as UTF-8 (undecodable bytes replaced with U+FFFD);
    str, dict and list values are stringified and stripped of control
    characters Excel does not accept.  Any other scalar (int, float,
    None, ...) is returned unchanged.
    """
    if isinstance(val, bytes):
        # errors="replace" never raises, so no try/except is needed here.
        val = val.decode("utf-8", errors="replace")
    if isinstance(val, (str, dict, list)):
        # str(val) is a no-op for str; dicts/lists get their repr, matching
        # the original behavior of writing str(val) into the cell.
        return _ILLEGAL_XLSX_CHARS.sub('', str(val))
    return val
def _read_input(input_file):
    """Return (bytes, name, extension) for an upload, or (None, None, None)."""
    if input_file is None:
        return None, None, None
    try:
        file_bytes = input_file.read()
        file_name = input_file.name
    except AttributeError:
        # Gradio may hand us a plain path string instead of a file object.
        file_name = input_file
        with open(file_name, "rb") as f:
            file_bytes = f.read()
    return file_bytes, file_name, file_name.lower().split('.')[-1]

def _load_parquet(file_bytes, parquet_url, conversion_label):
    """
    Load a DataFrame from uploaded Parquet bytes or, failing that, a URL.

    Returns (df, display_name) where display_name is "from_url.parquet"
    when the data came from the URL and None otherwise.
    Raises ValueError when neither source is available.
    """
    if file_bytes is not None:
        return pd.read_parquet(BytesIO(file_bytes)), None
    if parquet_url:
        # Timeout keeps the UI from hanging forever on a dead host.
        response = requests.get(parquet_url, timeout=60)
        response.raise_for_status()
        return pd.read_parquet(BytesIO(response.content)), "from_url.parquet"
    raise ValueError(
        f"For {conversion_label} conversion, please upload a file or provide a URL. π"
    )

def _recursive_sanitize(val):
    """Recursively decode bytes inside nested dicts/lists so the value is JSON-serializable."""
    if isinstance(val, bytes):
        return val.decode("utf-8", errors="replace")
    if isinstance(val, dict):
        return {k: _recursive_sanitize(v) for k, v in val.items()}
    if isinstance(val, list):
        return [_recursive_sanitize(item) for item in val]
    return val

def dataset_converter(input_file, conversion_type, parquet_url):
    """
    Convert an uploaded dataset (or remote Parquet URL) between formats.

    Parameters
    ----------
    input_file : file-like object, str path, or None
        The upload from ``gr.File`` (file object or path string).
    conversion_type : str
        One of "CSV to Parquet", "Parquet to CSV", "CSV to JSONL",
        "Parquet to JSONL", "Parquet to XLS".
    parquet_url : str
        Optional Parquet URL, used by the Parquet to JSONL/XLS
        conversions when no file is uploaded.

    Returns
    -------
    tuple[str, str]
        (path of the converted file, info message with a 10-row preview).

    Raises
    ------
    ValueError
        If the input is missing or its extension does not match the
        selected conversion type.
    """
    file_bytes, file_name, file_extension = _read_input(input_file)

    if conversion_type == "CSV to Parquet":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to Parquet conversion, please upload a CSV file. π")
        df = pd.read_csv(BytesIO(file_bytes))
        output_file = "output.parquet"
        df.to_parquet(output_file, index=False)
        converted_format = "Parquet"

    elif conversion_type == "Parquet to CSV":
        if input_file is None or file_extension != "parquet":
            raise ValueError("For Parquet to CSV conversion, please upload a Parquet file. π")
        df = pd.read_parquet(BytesIO(file_bytes))
        output_file = "output.csv"
        df.to_csv(output_file, index=False)
        converted_format = "CSV"

    elif conversion_type == "CSV to JSONL":
        if input_file is None or file_extension != "csv":
            raise ValueError("For CSV to JSONL conversion, please upload a CSV file. π")
        # latin1 accepts every byte value, so imperfectly-encoded CSVs still load.
        df = pd.read_csv(BytesIO(file_bytes), encoding='latin1')
        output_file = "metadata.jsonl"
        with open(output_file, 'w', encoding='utf-8') as f:
            for _, row in df.iterrows():
                data = {column: row[column] for column in df.columns}
                # "file_name" is pulled out as the top-level key (None when
                # the column is absent); the full row rides in "ground_truth".
                row_data = {"file_name": data.get("file_name"),
                            "ground_truth": json.dumps(data)}
                f.write(json.dumps(row_data) + '\n')
        converted_format = "JSONL"

    elif conversion_type == "Parquet to JSONL":
        df, url_name = _load_parquet(file_bytes, parquet_url, "Parquet to JSONL")
        if url_name:
            file_name = url_name
        output_file = "output.jsonl"
        with open(output_file, "w", encoding="utf-8") as f:
            for record in df.to_dict(orient="records"):
                f.write(json.dumps(_recursive_sanitize(record), ensure_ascii=False) + "\n")
        converted_format = "JSONL"

    elif conversion_type == "Parquet to XLS":
        df, url_name = _load_parquet(file_bytes, parquet_url, "Parquet to XLS")
        if url_name:
            file_name = url_name
        output_file = "output.xlsx"
        # write_only mode streams rows to disk, keeping memory flat for big files.
        wb = Workbook(write_only=True)
        ws = wb.create_sheet()
        ws.append(list(df.columns))
        for row in df.itertuples(index=False, name=None):
            ws.append([sanitize_value(cell) for cell in row])
        wb.save(output_file)
        converted_format = "XLS"

    else:
        raise ValueError("Invalid conversion type selected. β οΈ")

    preview_str = df.head(10).to_string(index=False)
    info_message = (
        f"Input file: {file_name if file_name is not None else 'N/A'}\n"
        f"Converted file format: {converted_format}\n\n"
        f"Preview (Top 10 Rows):\n{preview_str}\n\n"
        "Community: https://discord.gg/openfreeai π"
    )
    return output_file, info_message
# Custom CSS for a modern and sleek look.
# NOTE: this string is passed verbatim to gr.Blocks(css=...) below, so the
# selectors (.gradio-container, .gradio-button, ...) must match Gradio's
# generated class names — keep it byte-for-byte when editing.
custom_css = """
body {
    background-color: #f4f4f4;
    font-family: 'Helvetica Neue', Arial, sans-serif;
}
.gradio-container {
    max-width: 1000px;
    margin: 40px auto;
    padding: 20px;
    background-color: #ffffff;
    border-radius: 12px;
    box-shadow: 0 8px 16px rgba(0,0,0,0.1);
}
h1, h2 {
    color: #333333;
}
.gradio-input, .gradio-output {
    margin-bottom: 20px;
}
.gradio-button {
    background-color: #4CAF50 !important;
    color: white !important;
    border: none !important;
    padding: 10px 20px !important;
    font-size: 16px !important;
    border-radius: 6px !important;
    cursor: pointer;
}
.gradio-button:hover {
    background-color: #45a049 !important;
}
"""
# ---- Gradio UI ----------------------------------------------------------
# Layout: a row with the file upload on the left and the conversion
# controls on the right, a convert button, then a row with the converted
# file download and a text preview of the first 10 rows.
# NOTE(review): indentation reconstructed from a formatting-mangled source;
# nesting of the widgets inside Rows/Columns is the most plausible reading.
with gr.Blocks(css=custom_css, title="Datasets Convertor") as demo:
    gr.Markdown("# Datasets Convertor π")
    gr.Markdown(
        "Upload a CSV or Parquet file (or provide a Parquet file URL for Parquet to JSONL/XLS conversion) "
        "and select the conversion type. The app converts the file to the desired format and displays a preview of the top 10 rows. β¨"
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(label="Upload CSV or Parquet File π")
        with gr.Column(scale=1):
            conversion_type = gr.Radio(
                choices=["CSV to Parquet", "Parquet to CSV", "CSV to JSONL", "Parquet to JSONL", "Parquet to XLS"],
                label="Conversion Type π"
            )
            # URL is only consulted by the Parquet to JSONL/XLS branches.
            parquet_url = gr.Textbox(label="Parquet File URL (Optional) π", placeholder="Enter URL if not uploading a file")
    convert_button = gr.Button("Convert β‘", elem_classes=["gradio-button"])
    with gr.Row():
        output_file = gr.File(label="Converted File πΎ")
        preview = gr.Textbox(label="Preview (Top 10 Rows) π", lines=15)
    # dataset_converter returns (file path, info string) matching the outputs.
    convert_button.click(
        fn=dataset_converter,
        inputs=[input_file, conversion_type, parquet_url],
        outputs=[output_file, preview]
    )
    gr.Markdown("**Join our Community:** [https://discord.gg/openfreeai](https://discord.gg/openfreeai) π€")
demo.launch()