DataVerse

Running

App Files Files Community

evijit HF Staff committed on May 11

Commit

98b7de8

verified ·

1 Parent(s): 723854d

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -204

app.py CHANGED Viewed

@@ -2,13 +2,11 @@ import json
 import gradio as gr
 import pandas as pd
 import plotly.express as px
-import pyarrow.parquet as pq
 import os
-import requests
-from io import BytesIO
 import numpy as np
-# Define pipeline tags from the provided code
 PIPELINE_TAGS = [
  'text-generation',
  'text-to-image',
@@ -59,61 +57,63 @@ MODEL_SIZE_RANGES = {
     "XX-Large (>50GB)": (50, float('inf'))
 }
-# Filter functions for tags - keeping the same from provided code
-def is_audio_speech(model_dict):
-    tags = model_dict.get("tags", [])
-    pipeline_tag = model_dict.get("pipeline_tag", "")
     return (pipeline_tag and ("audio" in pipeline_tag.lower() or "speech" in pipeline_tag.lower())) or \
            any("audio" in tag.lower() for tag in tags) or \
            any("speech" in tag.lower() for tag in tags)
-def is_music(model_dict):
-    tags = model_dict.get("tags", [])
     return any("music" in tag.lower() for tag in tags)
-def is_robotics(model_dict):
-    tags = model_dict.get("tags", [])
     return any("robot" in tag.lower() for tag in tags)
-def is_biomed(model_dict):
-    tags = model_dict.get("tags", [])
     return any("bio" in tag.lower() for tag in tags) or \
            any("medic" in tag.lower() for tag in tags)
-def is_timeseries(model_dict):
-    tags = model_dict.get("tags", [])
     return any("series" in tag.lower() for tag in tags)
-def is_science(model_dict):
-    tags = model_dict.get("tags", [])
     return any("science" in tag.lower() and "bigscience" not in tag for tag in tags)
-def is_video(model_dict):
-    tags = model_dict.get("tags", [])
     return any("video" in tag.lower() for tag in tags)
-def is_image(model_dict):
-    tags = model_dict.get("tags", [])
     return any("image" in tag.lower() for tag in tags)
-def is_text(model_dict):
-    tags = model_dict.get("tags", [])
     return any("text" in tag.lower() for tag in tags)
 # Add model size filter function
-def is_in_size_range(model_dict, size_range):
     if size_range is None:
         return True
     min_size, max_size = MODEL_SIZE_RANGES[size_range]
-    # Get model size in GB from safetensors total (if available)
-    safetensors = model_dict.get("safetensors", None)
-    if safetensors and isinstance(safetensors, dict) and "total" in safetensors:
-        # Convert bytes to GB
-        size_gb = safetensors["total"] / (1024 * 1024 * 1024)
-        return min_size <= size_gb < max_size
     return False
@@ -198,7 +198,8 @@ def create_treemap(treemap_data, count_by, title=None):
         treemap_data,
         path=["root", "organization", "id"],
         values=count_by,
-        title=title or f"HuggingFace Models - {count_by.capitalize()} by Organization"
     )
     # Update layout
@@ -214,133 +215,34 @@ def create_treemap(treemap_data, count_by, title=None):
     return fig
-def download_with_progress(url, progress=None):
-    """Download a file with progress tracking"""
-    try:
-        response = requests.get(url, stream=True)
-        total_size = int(response.headers.get('content-length', 0))
-        block_size = 1024  # 1 Kibibyte
-        data = BytesIO()
-        if total_size == 0:
-            # If content length is unknown, we can't show accurate progress
-            if progress is not None:
-                progress(0, "Starting download...")
-            for chunk in response.iter_content(block_size):
-                data.write(chunk)
-                if progress is not None:
-                    progress(0, f"Downloading... (unknown size)")
-        else:
-            downloaded = 0
-            for chunk in response.iter_content(block_size):
-                downloaded += len(chunk)
-                data.write(chunk)
-                if progress is not None:
-                    percent = int(100 * downloaded / total_size)
-                    progress(percent / 100, f"Downloading... {percent}% ({downloaded//(1024*1024)}MB/{total_size//(1024*1024)}MB)")
-        return data.getvalue()
-    except Exception as e:
-        print(f"Error in download_with_progress: {e}")
-        raise
-def update_progress(progress_obj, value, description):
-    """Safely update progress with error handling"""
-    try:
-        if progress_obj is not None:
-            progress_obj(value, description)
-    except Exception as e:
-        print(f"Error updating progress: {e}")
-def download_and_process_models(progress=None):
-    """Download and process the models data from HuggingFace dataset with progress tracking"""
-    try:
-        # Create a cache directory
-        if not os.path.exists('data'):
-            os.makedirs('data')
-        # Check if we have cached data
-        if os.path.exists('data/processed_models.parquet'):
-            update_progress(progress, 1.0, "Loading from cache...")
-            print("Loading models from cache...")
-            df = pd.read_parquet('data/processed_models.parquet')
-            return df
-        # URL to the models.parquet file
-        url = "https://huggingface.co/datasets/cfahlgren1/hub-stats/resolve/main/models.parquet"
-        update_progress(progress, 0.0, "Starting download...")
-        print(f"Downloading models data from {url}...")
-        try:
-            # Download with progress tracking
-            file_content = download_with_progress(url, progress)
-            update_progress(progress, 0.9, "Parsing parquet file...")
-            # Read the parquet file
-            table = pq.read_table(BytesIO(file_content))
-            df = table.to_pandas()
-            print(f"Downloaded {len(df)} models")
-            update_progress(progress, 0.95, "Processing data...")
-            # Process the safetensors column if it's a string (JSON)
-            if 'safetensors' in df.columns:
-                def parse_safetensors(val):
-                    if isinstance(val, str):
-                        try:
-                            return json.loads(val)
-                        except:
-                            return None
-                    return val
-                df['safetensors'] = df['safetensors'].apply(parse_safetensors)
-            # Process the tags column if needed
-            if 'tags' in df.columns and len(df) > 0 and not isinstance(df['tags'].iloc[0], list):
-                def parse_tags(val):
-                    if isinstance(val, str):
-                        try:
-                            return json.loads(val)
-                        except:
-                            return []
-                    return val if isinstance(val, list) else []
-                df['tags'] = df['tags'].apply(parse_tags)
-            # Cache the processed data
-            update_progress(progress, 0.98, "Saving to cache...")
-            df.to_parquet('data/processed_models.parquet')
-            update_progress(progress, 1.0, "Data ready!")
-            return df
-        except Exception as download_error:
-            print(f"Download failed: {download_error}")
-            update_progress(progress, 0.5, "Download failed, generating sample data...")
-            return create_sample_data(progress)
-    except Exception as e:
-        print(f"Error downloading or processing data: {e}")
-        update_progress(progress, 1.0, "Using sample data (error occurred)")
-        # Return sample data for testing if real data unavailable
-        return create_sample_data(progress)
-def create_sample_data(progress=None):
-    """Create sample data for testing when real data is unavailable"""
-    print("Creating sample data for testing...")
-    if progress:
-        progress(0.3, "Creating sample data...")
-    # Sample organizations
     orgs = ['openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia', 'huggingface',
             'deepseek-ai', 'stability-ai', 'mistralai', 'cerebras', 'databricks', 'together',
-            'facebook', 'amazon', 'deepmind', 'cohere', 'nvidia', 'bigscience', 'eleutherai']
     # Common model name formats
     model_name_patterns = [
@@ -366,13 +268,10 @@ def create_sample_data(progress=None):
     variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]
     # Generate sample data
-    data = []
-    total_models = sum(np.random.randint(5, 20) for _ in orgs)
-    models_created = 0
     for org_idx, org in enumerate(orgs):
-        # Create 5-20 models per organization
-        num_models = np.random.randint(5, 20)
         for i in range(num_models):
             # Create realistic model name
@@ -428,11 +327,11 @@ def create_sample_data(progress=None):
             # Generate downloads and likes (weighted by org position for variety)
             # Earlier orgs get more downloads to make the visualization interesting
             popularity_factor = (len(orgs) - org_idx) / len(orgs)  # 1.0 to 0.0
-            base_downloads = 1000 * (10 ** (2 * popularity_factor))
             downloads = int(base_downloads * np.random.uniform(0.3, 3.0))
             likes = int(downloads * np.random.uniform(0.01, 0.1))  # 1-10% like ratio
-            # Generate model size (in bytes for safetensors total)
             # Model size should correlate somewhat with the size in the name
             size_indicator = 1
             for s in ["70b", "13b", "7b", "3b", "2b", "1b", "large", "huge", "xl", "xxl"]:
@@ -440,53 +339,31 @@ def create_sample_data(progress=None):
                     size_indicator = float(s.replace("b", "")) if s[0].isdigit() else 3
                     break
-            # Size in GB, then convert to bytes
-            size_gb = np.random.uniform(0.1, 2.0) * size_indicator
-            if size_gb > 50:  # Cap at 100GB
-                size_gb = min(size_gb, 100)
-            size_bytes = int(size_gb * 1e9)
             # Create model entry
             model = {
                 "id": model_id,
                 "downloads": downloads,
-                "downloadsAllTime": int(downloads * np.random.uniform(1.5, 3.0)),  # All-time higher than recent
                 "likes": likes,
                 "pipeline_tag": pipeline_tag,
                 "tags": tags,
-                "safetensors": {"total": size_bytes}
             }
-            data.append(model)
-            models_created += 1
-            if progress and i % 5 == 0:
-                progress(0.3 + 0.6 * (models_created / total_models), f"Created {models_created}/{total_models} sample models...")
-    # Convert to DataFrame
-    df = pd.DataFrame(data)
-    if progress:
-        progress(0.95, "Finalizing sample data...")
-    return df
 # Create Gradio interface
 with gr.Blocks() as demo:
     models_data = gr.State()  # To store loaded data
-    # Loading screen components
-    with gr.Row(visible=True) as loading_screen:
-        with gr.Column(scale=1):
-            gr.Markdown("""
-                # HuggingFace Models TreeMap Visualization
-                Loading data... This might take a moment.
-            """)
-            data_loading_progress = gr.Progress()
-    # Main application components (initially hidden)
-    with gr.Row(visible=False) as main_app:
         gr.Markdown("""
             # HuggingFace Models TreeMap Visualization
@@ -496,11 +373,11 @@ with gr.Blocks() as demo:
             The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
         """)
-    with gr.Row(visible=False) as control_panel:
         with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
-                choices=["downloads", "downloadsAllTime", "likes"],
                 value="downloads",
                 info="Select the metric to determine box sizes"
             )
@@ -532,7 +409,7 @@ with gr.Blocks() as demo:
                 label="Model Size Filter",
                 choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
                 value="None",
-                info="Filter models by their size (in safetensors['total'])"
             )
             top_k_slider = gr.Slider(
@@ -623,17 +500,11 @@ with gr.Blocks() as demo:
         outputs=[tag_filter_dropdown, pipeline_filter_dropdown]
     )
-    def load_data_with_progress(progress=gr.Progress()):
-        """Load data with progress tracking and update UI visibility"""
-        data_df = download_and_process_models(progress)
-        # Return both the data and the visibility updates
-        return data_df, gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
-    # Load data once at startup with progress bar
     demo.load(
-        fn=load_data_with_progress,
         inputs=[],
-        outputs=[models_data, loading_screen, main_app, control_panel]
     )
     # Button click event to generate plot

 import gradio as gr
 import pandas as pd
 import plotly.express as px
 import os
 import numpy as np
+import io
+# Define pipeline tags
 PIPELINE_TAGS = [
  'text-generation',
  'text-to-image',
     "XX-Large (>50GB)": (50, float('inf'))
 }
+# Filter functions for tags
+def is_audio_speech(row):
+    tags = row.get("tags", [])
+    pipeline_tag = row.get("pipeline_tag", "")
     return (pipeline_tag and ("audio" in pipeline_tag.lower() or "speech" in pipeline_tag.lower())) or \
            any("audio" in tag.lower() for tag in tags) or \
            any("speech" in tag.lower() for tag in tags)
+def is_music(row):
+    tags = row.get("tags", [])
     return any("music" in tag.lower() for tag in tags)
+def is_robotics(row):
+    tags = row.get("tags", [])
     return any("robot" in tag.lower() for tag in tags)
+def is_biomed(row):
+    tags = row.get("tags", [])
     return any("bio" in tag.lower() for tag in tags) or \
            any("medic" in tag.lower() for tag in tags)
+def is_timeseries(row):
+    tags = row.get("tags", [])
     return any("series" in tag.lower() for tag in tags)
+def is_science(row):
+    tags = row.get("tags", [])
     return any("science" in tag.lower() and "bigscience" not in tag for tag in tags)
+def is_video(row):
+    tags = row.get("tags", [])
     return any("video" in tag.lower() for tag in tags)
+def is_image(row):
+    tags = row.get("tags", [])
     return any("image" in tag.lower() for tag in tags)
+def is_text(row):
+    tags = row.get("tags", [])
     return any("text" in tag.lower() for tag in tags)
 # Add model size filter function
+def is_in_size_range(row, size_range):
     if size_range is None:
         return True
     min_size, max_size = MODEL_SIZE_RANGES[size_range]
+    # Get model size in GB from params column
+    if "params" in row and pd.notna(row["params"]):
+        try:
+            # Convert to GB (assuming params are in bytes or scientific notation)
+            size_gb = float(row["params"]) / (1024 * 1024 * 1024)
+            return min_size <= size_gb < max_size
+        except (ValueError, TypeError):
+            return False
     return False
         treemap_data,
         path=["root", "organization", "id"],
         values=count_by,
+        title=title or f"HuggingFace Models - {count_by.capitalize()} by Organization",
+        color_discrete_sequence=px.colors.qualitative.Plotly
     )
     # Update layout
     return fig
+def load_models_csv():
+    """Load the model table from 'models.csv' and append generated sample rows.
+    Returns a DataFrame whose 'tags' column holds lists of strings, with the
+    rows built by add_sample_data concatenated after the CSV rows.
+    """
+    # Read the CSV file
+    df = pd.read_csv('models.csv')
+    # Process the tags column
+    def process_tags(tags_str):
+        """Convert the stringified tag list stored in the CSV into a list."""
+        # Missing cells become an empty tag list so the tag filters can iterate
+        if pd.isna(tags_str):
+            return []
+        # Drop surrounding brackets and single quotes, then split on whitespace
+        # NOTE(review): presumably tags are space-separated in the CSV; a
+        # comma-separated repr would leave trailing commas on tags - confirm.
+        tags_str = tags_str.strip("[]").replace("'", "")
+        tags = [tag.strip() for tag in tags_str.split() if tag.strip()]
+        return tags
+    df['tags'] = df['tags'].apply(process_tags)
+    # Add more sample data for better visualization.
+    # add_sample_data returns a new concatenated frame instead of mutating its
+    # argument, so the result must be kept or the sample rows are lost.
+    df = add_sample_data(df)
+    return df
+def add_sample_data(df):
+    """Add more sample data to make the visualization more interesting"""
+    # Top organizations to include
     orgs = ['openai', 'meta', 'google', 'microsoft', 'anthropic', 'nvidia', 'huggingface',
             'deepseek-ai', 'stability-ai', 'mistralai', 'cerebras', 'databricks', 'together',
+            'facebook', 'amazon', 'deepmind', 'cohere', 'bigscience', 'eleutherai']
     # Common model name formats
     model_name_patterns = [
     variants = ["chat", "instruct", "base", "v1.0", "v2", "beta", "turbo", "fast", "xl", "xxl"]
     # Generate sample data
+    sample_data = []
     for org_idx, org in enumerate(orgs):
+        # Create 5-10 models per organization
+        num_models = np.random.randint(5, 11)
         for i in range(num_models):
             # Create realistic model name
             # Generate downloads and likes (weighted by org position for variety)
             # Earlier orgs get more downloads to make the visualization interesting
             popularity_factor = (len(orgs) - org_idx) / len(orgs)  # 1.0 to 0.0
+            base_downloads = 10000 * (10 ** (2 * popularity_factor))
             downloads = int(base_downloads * np.random.uniform(0.3, 3.0))
             likes = int(downloads * np.random.uniform(0.01, 0.1))  # 1-10% like ratio
+            # Generate model size (in bytes for params)
             # Model size should correlate somewhat with the size in the name
             size_indicator = 1
             for s in ["70b", "13b", "7b", "3b", "2b", "1b", "large", "huge", "xl", "xxl"]:
                     size_indicator = float(s.replace("b", "")) if s[0].isdigit() else 3
                     break
+            # Size in bytes
+            params = int(np.random.uniform(0.5, 2.0) * size_indicator * 1e9)
             # Create model entry
             model = {
                 "id": model_id,
+                "author": org,
                 "downloads": downloads,
                 "likes": likes,
                 "pipeline_tag": pipeline_tag,
                 "tags": tags,
+                "params": params
             }
+            sample_data.append(model)
+    # Convert sample data to DataFrame and append to original
+    sample_df = pd.DataFrame(sample_data)
+    return pd.concat([df, sample_df], ignore_index=True)
 # Create Gradio interface
 with gr.Blocks() as demo:
     models_data = gr.State()  # To store loaded data
+    with gr.Row():
         gr.Markdown("""
             # HuggingFace Models TreeMap Visualization
             The treemap visualizes models grouped by organization, with the size of each box representing the selected metric (downloads or likes).
         """)
+    with gr.Row():
         with gr.Column(scale=1):
             count_by_dropdown = gr.Dropdown(
                 label="Metric",
+                choices=["downloads", "likes"],
                 value="downloads",
                 info="Select the metric to determine box sizes"
             )
                 label="Model Size Filter",
                 choices=["None"] + list(MODEL_SIZE_RANGES.keys()),
                 value="None",
+                info="Filter models by their size (using params column)"
             )
             top_k_slider = gr.Slider(
         outputs=[tag_filter_dropdown, pipeline_filter_dropdown]
     )
+    # Load data once at startup
     demo.load(
+        fn=load_models_csv,
         inputs=[],
+        outputs=[models_data]
     )
     # Button click event to generate plot