"""Build and upload cached graph and timeline artifacts for a repository.

Clones the target repository at depth 1, builds the graph and timeline JSON
with modular_graph_and_candidates, renders both as HTML, and uploads the
results to a Hugging Face Hub cache repo keyed by commit SHA and the
similarity settings used.
"""
import os
import json
import subprocess
import tempfile
from pathlib import Path
from datetime import datetime, timezone

from huggingface_hub import HfApi

from modular_graph_and_candidates import (
    build_graph_json,
    generate_html,
    build_timeline_json,
    generate_timeline_html
)

REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
CACHE_REPO = "Molbap/hf_cached_embeds_log"
THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1", "true", "True", "YES", "yes"}
SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")

def main():
    print(f"Building cache for {REPO_URL}")
    print(f"Config: threshold={THRESH}, multimodal={MULTIMODAL}, method={SIM_METHOD}")

    tmp = Path(tempfile.mkdtemp())
    print(f"Working in {tmp}")

    print("Cloning repository...")
    subprocess.check_call([
        "git", "clone", "--depth", "1", REPO_URL, str(tmp / "repo")
    ])

    # Record the SHA of the cloned HEAD; it keys this cache entry.
    sha = subprocess.check_output([
        "git", "rev-parse", "HEAD"
    ], cwd=tmp / "repo", text=True).strip()

    print(f"Repository SHA: {sha}")

    repo_path = tmp / "repo"

    print("Building graph...")
    graph = build_graph_json(
        transformers_dir=repo_path,
        threshold=THRESH,
        multimodal=MULTIMODAL,
        sim_method=SIM_METHOD,
    )

    print("Building timeline...")
    timeline = build_timeline_json(
        transformers_dir=repo_path,
        threshold=THRESH,
        multimodal=MULTIMODAL,
        sim_method=SIM_METHOD,
    )

    print("Generating HTML...")
    graph_html = generate_html(graph)
    timeline_html = generate_timeline_html(timeline)

    print(f"Uploading to {CACHE_REPO}...")

    api = HfApi()

    key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"

    # latest.json points consumers at the newest cache entry and its settings.
    latest = {
        "sha": sha,
        "updated_utc": datetime.now(timezone.utc).isoformat(),
        "defaults": {
            "sim_method": SIM_METHOD,
            "threshold": THRESH,
            "multimodal": MULTIMODAL
        },
        "paths": {
            "graph_json": f"graph/{key}.json",
            "graph_html": f"graph/{key}.html",
            "timeline_json": f"timeline/{key}.json",
            "timeline_html": f"timeline/{key}.html",
        },
    }

    # (path in the cache repo, serialized content) pairs to upload.
    files_to_upload = [
        (f"graph/{key}.json", json.dumps(graph, separators=(',', ':'))),
        (f"graph/{key}.html", graph_html),
        (f"timeline/{key}.json", json.dumps(timeline, separators=(',', ':'))),
        (f"timeline/{key}.html", timeline_html),
        ("latest.json", json.dumps(latest, separators=(',', ':'))),
    ]

    for path_in_repo, content in files_to_upload:
        # Reuse a single scratch file; it is overwritten before each upload.
        temp_file = tmp / "upload_temp"
        temp_file.write_text(content, encoding="utf-8")

        api.upload_file(
            path_or_fileobj=str(temp_file),
            path_in_repo=path_in_repo,
            repo_id=CACHE_REPO,
            commit_message=f"Cache update {sha[:7]} - {SIM_METHOD} t={THRESH} m={int(MULTIMODAL)}"
        )
        print(f"Uploaded {path_in_repo}")

    print(f"Successfully uploaded cache for {key}")

if __name__ == "__main__":
    main()