| import os | |
| import json | |
| import subprocess | |
| import tempfile | |
| from pathlib import Path | |
| from datetime import datetime, timezone | |
| from huggingface_hub import HfApi | |
| from modular_graph_and_candidates import ( | |
| build_graph_json, | |
| generate_html, | |
| build_timeline_json, | |
| generate_timeline_html | |
| ) | |
# Git repository that gets cloned and analysed; override with the REPO_URL env var.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hub repository that receives the generated artifacts.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Similarity threshold forwarded to the graph/timeline builders (env: SIM_THRESHOLD).
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
# Boolean flag (env: MULTIMODAL). The value is normalised before comparison so
# that spellings such as "TRUE", "Yes" or " 1 " are not silently read as False.
MULTIMODAL = os.getenv("MULTIMODAL", "0").strip().lower() in {"1", "true", "yes"}
# Similarity method name forwarded to the builders (env: SIM_METHOD).
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
def main():
    """Build the graph and timeline caches for REPO_URL and publish them to CACHE_REPO.

    The repository is shallow-cloned into a temporary directory, its HEAD commit
    SHA is recorded, and the graph/timeline payloads plus their HTML renderings
    are generated from the checkout. Each artifact is then uploaded under a key
    derived from the SHA and the active settings, followed by a ``latest.json``
    manifest that points at the uploaded paths.

    Raises:
        subprocess.CalledProcessError: if ``git clone`` or ``git rev-parse``
            exits with a non-zero status.
    """
    print(f"Building cache for {REPO_URL}")
    print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")

    # TemporaryDirectory removes the checkout when the block exits, including on
    # failure; a bare mkdtemp() would leave a full clone behind after every run.
    with tempfile.TemporaryDirectory() as workdir:
        repo_path = Path(workdir) / "repo"
        print(f"Working in {workdir}")

        print("Cloning repository...")
        subprocess.check_call(
            ["git", "clone", "--depth", "1", REPO_URL, str(repo_path)]
        )
        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()
        print(f"Repository SHA: {sha}")

        print("Building graph...")
        graph = build_graph_json(
            transformers_dir=repo_path,
            threshold=THRESH,
            multimodal=MULTIMODAL,
            sim_method=SIM_METHOD,
        )

        print("Building timeline...")
        timeline = build_timeline_json(
            transformers_dir=repo_path,
            threshold=THRESH,
            multimodal=MULTIMODAL,
            sim_method=SIM_METHOD,
        )

        print("Generating HTML...")
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

    print(f"Uploading to {CACHE_REPO}...")
    api = HfApi()

    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
    # Single source of truth for the artifact locations, shared by the manifest
    # and the upload list so the two cannot drift apart.
    paths = {
        "graph_json": f"graph/{key}.json",
        "graph_html": f"graph/{key}.html",
        "timeline_json": f"timeline/{key}.json",
        "timeline_html": f"timeline/{key}.html",
    }
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {
            "sim_method": SIM_METHOD,
            "threshold": THRESH,
            "multimodal": MULTIMODAL,
        },
        "paths": paths,
    }

    compact = (",", ":")
    files_to_upload = [
        (paths["graph_json"], json.dumps(graph, separators=compact)),
        (paths["graph_html"], graph_html),
        (paths["timeline_json"], json.dumps(timeline, separators=compact)),
        (paths["timeline_html"], timeline_html),
        # Keep the manifest last: it is only published once every artifact it
        # references has been uploaded.
        ("latest.json", json.dumps(latest, separators=compact)),
    ]

    commit_message = (
        f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}"
    )
    for path_in_repo, content in files_to_upload:
        # upload_file accepts raw bytes, so no scratch file is needed and the
        # payload is always UTF-8 without platform newline translation.
        api.upload_file(
            path_or_fileobj=content.encode("utf-8"),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            commit_message=commit_message,
        )
        print(f"Uploaded {path_in_repo}")

    print(f"Successfully uploaded cache for {key}")
# Script entry point: run the cache build only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()