#!/usr/bin/env python3
"""
Beautiful custom timeline visualization for Transformers models using Flask.
"""

import glob
import os
import re
import subprocess
import sys
import time
import webbrowser
from datetime import datetime
from typing import Optional

from flask import Flask, jsonify, render_template, request

import transformers

# PyYAML is optional at import time: when unavailable the toctree parser
# returns None, and the parser's __init__ then raises RuntimeError.
try:
    import yaml  # type: ignore
except Exception:  # pragma: no cover
    yaml = None


class TransformersTimelineParser:
    """Parser for extracting model release dates from Transformers documentation."""

    def __init__(self, docs_dir: str):
        """Initialize the parser and load modality definitions from the docs toctree.

        Args:
            docs_dir: Path to the model documentation directory; the toctree
                path is resolved relative to it (see _parse_modalities_from_toctree).

        Raises:
            RuntimeError: If modalities cannot be parsed from _toctree.yml.
        """
        self.docs_dir = docs_dir
        self.models_cache = None  # lazy cache; populated by code outside this chunk
        self.tasks_cache = {}  # per-model task cache; populated by code outside this chunk

        # Add transformers source directory to Python path to import auto mappings
        transformers_src = os.path.join(os.path.dirname(docs_dir), "..", "..", "src")
        if transformers_src not in sys.path:
            sys.path.insert(0, transformers_src)

        # Parse modalities dynamically; no fallback to static definitions
        parsed_modalities = self._parse_modalities_from_toctree()
        if not parsed_modalities:
            raise RuntimeError("Failed to parse modalities from docs toctree (_toctree.yml)")
        self.modalities = parsed_modalities

    def _parse_modalities_from_toctree(self) -> Optional[dict[str, dict[str, object]]]:
        """Parse model modalities and slugs from docs/source/en/_toctree.yml.

        Returns a dict with the same schema as self.modalities or None on failure.
        """
        # Compute toctree path relative to provided docs_dir
        toctree_path = os.path.join(self.docs_dir, "..", "_toctree.yml")
        if not os.path.isfile(toctree_path):
            return None
        if yaml is None:
            return None
        with open(toctree_path, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
        if not isinstance(data, list):
            return None

        # Locate API -> Models
        api_top = None
        for entry in data:
            if isinstance(entry, dict) and entry.get("title") == "API" and entry.get("sections"):
                api_top = entry
                break
        if api_top is None:
            # Fall back to a depth-first search anywhere in the tree for the
            # "API" section when it is not a top-level entry.
            def _dfs_find_api(node):
                if isinstance(node, dict) and node.get("title") == "API" and node.get("sections"):
                    return node
                if isinstance(node, dict):
                    for v in node.values():
                        found = _dfs_find_api(v)
                        if found is not None:
                            return found
                if isinstance(node, list):
                    for v in node:
                        found = _dfs_find_api(v)
                        if found is not None:
                            return found
                return None

            api_top = _dfs_find_api(data)
        if api_top is None:
            return None

        models_top = None
        for sec in api_top.get("sections", []):
            if isinstance(sec, dict) and sec.get("title") == "Models" and sec.get("sections"):
                models_top = sec
                break
        if models_top is None:
            # Same DFS fallback, scoped to the API subtree, for "Models".
            def _dfs_find_models(node):
                if isinstance(node, dict) and node.get("title") == "Models" and node.get("sections"):
                    return node
                if isinstance(node, dict):
                    for v in node.values():
                        found = _dfs_find_models(v)
                        if found is not None:
                            return found
                if isinstance(node, list):
                    for v in node:
                        found = _dfs_find_models(v)
                        if found is not None:
                            return found
                return None

            models_top = _dfs_find_models(api_top)
        if models_top is None:
            return None

        # Helper to extract slugs from a section like "Text models"
        def extract_model_slugs(section_title: str) -> list[str]:
            result: list[str] = []
            for sec in models_top.get("sections", []):
                if isinstance(sec, dict) and sec.get("title") == section_title:
                    # Items may be nested under sections -> sections -> list of {local: model_doc/, title: ...}
                    nested = sec.get("sections") or []
                    for sub in nested:
                        if not isinstance(sub, dict):
                            continue
                        # Direct list:
                        if "local" in sub:
                            local = sub.get("local")
                            if isinstance(local, str) and local.startswith("model_doc/"):
                                result.append(local.split("/", 1)[1])
                        # Or deeper nesting
                        # NOTE(review): leaf is assumed to be a dict here — a
                        # non-dict leaf would raise AttributeError on .get();
                        # confirm the toctree never nests plain strings at this depth.
                        for leaf in sub.get("sections", []) if isinstance(sub.get("sections"), list) else []:
                            local = leaf.get("local")
                            if isinstance(local, str) and local.startswith("model_doc/"):
                                result.append(local.split("/", 1)[1])
            return result

        text_models = extract_model_slugs("Text models")
        vision_models = extract_model_slugs("Vision models")
        audio_models = extract_model_slugs("Audio models")
        video_models = extract_model_slugs("Video models")
        multimodal_models = extract_model_slugs("Multimodal models")
        rl_models = extract_model_slugs("Reinforcement learning models")
        ts_models = extract_model_slugs("Time series models")
        graph_models = extract_model_slugs("Graph models")

        # Basic validation: require at least some categories to be non-empty
        if not any([text_models, vision_models, audio_models, video_models, multimodal_models]):
            return None

        # Preserve existing names and colors
        return {
            "text": {"name": "Text Models", "color": "#F59E0B", "models": text_models},
            "vision": {"name": "Vision Models", "color": "#06B6D4", "models": vision_models},
            "audio": {"name": "Audio Models", "color": "#8B5CF6", "models": audio_models},
            "video": {"name": "Video Models", "color": "#EC4899", "models": video_models},
            "multimodal": {"name": "Multimodal Models", "color": "#10B981", "models": multimodal_models},
            "reinforcement": {"name": "Reinforcement Learning", "color": "#EF4444", "models": rl_models},
            "timeseries": {"name": "Time Series Models", "color": "#F97316", "models": ts_models},
            "graph": {"name": "Graph Models", "color": "#6B7280", "models": graph_models},
        }

    def get_model_modality(self, model_name: str) -> dict[str, str]:
        """Determine the modality category for a given model.

        Returns a dict with "key", "name", and "color" entries taken from
        self.modalities; falls back to the text modality when unknown.
        """
        for modality_key, modality_info in self.modalities.items():
            if model_name in modality_info["models"]:
                return {"key": modality_key, "name": modality_info["name"], "color": modality_info["color"]}
        # Default to text if not found (most common)
        return {"key": "text", "name": "Text Models", "color": "#F59E0B"}

    def parse_release_date_from_file(self, file_path: str) -> Optional[dict[str, str]]:
        """Parse the release date line from a model documentation file.

        Returns a per-model record dict, or None when the release/addition
        sentence is absent, the Transformers date is invalid, or any error
        occurs while reading/processing the file.
        """
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()

            # Extract model name from file path (always available)
            model_name = os.path.basename(file_path).replace(".md", "")

            # Initialize default values
            release_date = None
            transformers_date = None

            # Focus on the end of the sentence - the Transformers addition date is what matters most
            pattern = (
                r"\*This model was released on (.+?) and added to Hugging Face Transformers on (\d{4}-\d{2}-\d{2})\.\*"
            )
            match = re.search(pattern, content)

            if match:
                release_date = match.group(1).strip()
                transformers_date = match.group(2)

                # Validate the Transformers date (this is the critical one for our timeline)
                try:
                    datetime.strptime(transformers_date, "%Y-%m-%d")
                except ValueError:
                    return None

                # Handle release_date - could be "None" or an actual date
                if release_date.lower() == "none":
                    release_date = None
                else:
                    # Try to validate as a date, but don't fail if it's not
                    try:
                        datetime.strptime(release_date, "%Y-%m-%d")
                    except ValueError:
                        # Keep the original value even if it's not a valid date
                        pass
            else:
                # No release date pattern found - warn and skip (ignore auto.md intentionally)
                base = os.path.basename(file_path)
                if base != "auto.md":
                    print(f"āš ļø Warning: No release/addition dates found in {file_path}; skipping.")
                return None

            # Get modality information
            modality = self.get_model_modality(model_name)

            # Extract model description
            description = self.extract_model_description(content)

            # Get supported tasks/pipelines
            # NOTE(review): get_model_tasks is defined outside this chunk.
            tasks = self.get_model_tasks(model_name)

            return {
                "model_name": model_name,
                "file_path": file_path,
                "release_date": release_date,
                "transformers_date": transformers_date,
                "modality": modality["key"],
                "modality_name": modality["name"],
                "modality_color": modality["color"],
                "description": description,
                "tasks": tasks,
            }
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None

    def extract_model_description(self, content: str) -> str:
        """Extract the first 1000 characters of model description, excluding HTML/XML tags."""
        try:
            # Remove HTML/XML tags
            content_no_tags = re.sub(r"<[^>]+>", "", content)

            # Find the start of the actual description (after the initial metadata)
            # Look for the first substantial paragraph after the initial lines
            lines = content_no_tags.split("\n")
            description_start = 0

            # Skip initial metadata, imports, and short lines
            for i, line in enumerate(lines):
                stripped = line.strip()
                if (
                    len(stripped) > 50
                    and not stripped.startswith("#")
                    and not stripped.startswith("*This model was released")
                    # NOTE(review): the source chunk is truncated at this point —
                    # the remainder of this condition and of this method is missing.
                    and not stripped.startswith("
āˆ’
100%
+
Loading timeline...
-
Total Models
-
Displayed Models
-
Date Range
"""
    # NOTE(review): this chunk resumes mid-file — the line above closes the
    # (off-view) html_content template string, and `template_dir` /
    # `html_content` belong to the off-view create_timeline_template() function.
    with open(os.path.join(template_dir, "timeline.html"), "w", encoding="utf-8") as f:
        f.write(html_content)


def open_browser():
    """Open the browser after a short delay."""
    # NOTE(review): main() serves on port 7860 (see app.run below), but this
    # opens port 5000 — likely stale; confirm the intended port.
    time.sleep(1.5)
    webbrowser.open("http://localhost:5000")


def main():
    """Main function to run the timeline app."""
    print("šŸ¤— Transformers Models Timeline")
    print("=" * 50)

    # Create templates
    create_timeline_template()

    # Check if docs directory exists
    # NOTE(review): docs_dir, parser, and app are module-level names defined
    # outside this chunk.
    if not os.path.exists(docs_dir):
        print(f"āŒ Error: Documentation directory not found at {docs_dir}")
        print("Please update the 'docs_dir' variable in the script.")
        return

    # Parse models to check if any are found
    models = parser.parse_all_model_dates()
    if not models:
        print(f"āš ļø Warning: No models found with release dates in {docs_dir}")
    else:
        print(f"āœ… Found {len(models)} models with release dates")

    # Run Flask app
    try:
        app.run(host="0.0.0.0", port=7860, debug=False)
    except KeyboardInterrupt:
        print("\nšŸ‘‹ Timeline server stopped")


if __name__ == "__main__":
    main()