# Scraped from the Hugging Face Hub file viewer (raw / history / blame view).
# Author: Molbap (HF Staff) - commit 6d106b8 ("setup cache") - 3.13 kB
import os
import json
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone
from huggingface_hub import HfApi
from modular_graph_and_candidates import (
build_graph_json,
generate_html,
build_timeline_json,
generate_timeline_html
)
def env_flag(name, default="0"):
    """Return True when environment variable *name* holds a truthy spelling.

    Accepts ``1``, ``true`` and ``yes`` in any letter case, ignoring
    surrounding whitespace. Anything else (including an unset variable with
    the default ``"0"``) is False.

    Args:
        name: Name of the environment variable to read.
        default: String used when the variable is unset.

    Returns:
        bool: Whether the variable's value is one of the truthy spellings.
    """
    return os.getenv(name, default).strip().lower() in {"1", "true", "yes"}


# Git URL of the repository that main() clones and analyses.
REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
# Hub repo id that main() uploads the generated cache files to.
CACHE_REPO = "Molbap/hf_cached_embeds_log"
# Similarity threshold forwarded to the graph and timeline builders.
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
# Parsed case-insensitively so "TRUE" / "Yes" behave the same as "True" / "YES".
MULTIMODAL = env_flag("MULTIMODAL")
# Similarity method name forwarded to the graph and timeline builders.
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
def main():
    """Build the graph and timeline caches for REPO_URL and upload them.

    Shallow-clones REPO_URL into a temporary directory, reads the cloned
    HEAD SHA, builds the graph and timeline payloads with the module-level
    THRESH / MULTIMODAL / SIM_METHOD settings, renders both to HTML, and
    uploads the four artifacts plus a ``latest.json`` pointer to CACHE_REPO.

    The temporary directory (including the clone) is removed when the
    function exits, whether it succeeds or raises.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` or ``git rev-parse``
            exits with a non-zero status.
    """
    print(f"Building cache for {REPO_URL}")
    print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")

    # A context-managed directory guarantees the clone is deleted afterwards;
    # a bare mkdtemp() would leave a full checkout behind on every run.
    with tempfile.TemporaryDirectory() as tmp_name:
        tmp = Path(tmp_name)
        repo_path = tmp / "repo"
        print(f"Working in {tmp}")

        print("Cloning repository...")
        subprocess.check_call([
            "git", "clone", "--depth", "1", REPO_URL, str(repo_path)
        ])

        sha = subprocess.check_output(
            ["git", "rev-parse", "HEAD"], cwd=repo_path, text=True
        ).strip()
        print(f"Repository SHA: {sha}")

        print("Building graph...")
        graph = build_graph_json(
            transformers_dir=repo_path,
            threshold=THRESH,
            multimodal=MULTIMODAL,
            sim_method=SIM_METHOD,
        )

        print("Building timeline...")
        timeline = build_timeline_json(
            transformers_dir=repo_path,
            threshold=THRESH,
            multimodal=MULTIMODAL,
            sim_method=SIM_METHOD,
        )

        print("Generating HTML...")
        graph_html = generate_html(graph)
        timeline_html = generate_timeline_html(timeline)

        print(f"Uploading to {CACHE_REPO}...")
        api = HfApi()

        # Artifacts are keyed by commit SHA and by the settings that produced them.
        key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
        paths = {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
        }
        latest = {
            "sha": sha,
            "updated_utc": datetime.now(timezone.utc).isoformat(),
            "defaults": {
                "sim_method": SIM_METHOD,
                "threshold": THRESH,
                "multimodal": MULTIMODAL,
            },
            "paths": paths,
        }

        # latest.json goes last so it is only written after the files it
        # points at have been uploaded.
        files_to_upload = [
            (paths["graph_json"], json.dumps(graph, separators=(',', ':'))),
            (paths["graph_html"], graph_html),
            (paths["timeline_json"], json.dumps(timeline, separators=(',', ':'))),
            (paths["timeline_html"], timeline_html),
            ("latest.json", json.dumps(latest, separators=(',', ':'))),
        ]

        # One scratch file is reused for every upload; it lives in the
        # temporary directory and is cleaned up with it.
        temp_file = tmp / "upload_temp"
        for path_in_repo, content in files_to_upload:
            temp_file.write_text(content, encoding="utf-8")
            api.upload_file(
                path_or_fileobj=str(temp_file),
                path_in_repo=path_in_repo,
                repo_id=CACHE_REPO,
                commit_message=f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}"
            )
            print(f"Uploaded {path_in_repo}")

    print(f"Successfully uploaded cache for {key}")
# Script entry point: run the cache build only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()