Molbap HF Staff commited on
Commit
3bfd58a
·
1 Parent(s): b1eaa18
dist/distill.bundle.js ADDED
The diff for this file is too large to render. See raw diff
 
dist/distill.bundle.js.map ADDED
The diff for this file is too large to render. See raw diff
 
dist/fragments/attention-visualizer.html ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div style="border: 1px solid #e2e8f0; border-radius: 8px; background: white; margin: 1.5rem 0;">
2
+ <div style="padding: 1rem; border-bottom: 1px solid #e2e8f0; background: #f8f9fa;">
3
+ <h4 style="margin: 0 0 0.5rem 0; color: #495057;">🔍 Attention Mask Visualizer</h4>
4
+ <p style="margin: 0; font-size: 0.9em; color: #6c757d;">
5
+ Visualize attention patterns in transformer models. This helps debug attention mask issues.
6
+ </p>
7
+ </div>
8
+
9
+ <div style="padding: 1rem;">
10
+ <div style="display: grid; grid-template-columns: 1fr auto; gap: 1rem; align-items: start; margin-bottom: 1rem;">
11
+ <div>
12
+ <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Model:</label>
13
+ <select id=model-select style="width: 100%; padding: 0.5rem; border: 1px solid #d1d5db; border-radius: 6px; background: white;">
14
+ <option value=openai-community/gpt2>openai-community/gpt2</option>
15
+ <option value=google/gemma-2-2b>google/gemma-2-2b</option>
16
+ <option value=microsoft/DialoGPT-small>microsoft/DialoGPT-small</option>
17
+ </select>
18
+ </div>
19
+
20
+ <div>
21
+ <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Action:</label>
22
+ <button id=visualize-btn style="padding: 0.5rem 1rem; background: #3b82f6; color: white; border: none; border-radius: 6px; cursor: pointer; font-weight: 500;">
23
+ 🚀 Visualize
24
+ </button>
25
+ </div>
26
+ </div>
27
+
28
+ <div style="margin-bottom: 1rem;">
29
+ <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Prompt:</label>
30
+ <textarea id=prompt-input style="width: 100%; padding: 0.75rem; border: 1px solid #d1d5db; border-radius: 6px; resize: vertical; font-family: monospace; font-size: 0.9em;" rows=3 placeholder="You are an assistant. Make sure you print me."></textarea>
31
+ </div>
32
+
33
+ <div id=attention-output style="min-height: 200px; background: #f8f9fa; border: 1px solid #e9ecef; border-radius: 6px; padding: 1rem;">
34
+ <div style="text-align: center; color: #6c757d; font-style: italic;">
35
+ Click "Visualize" to generate attention visualization
36
+ </div>
37
+ </div>
38
+ </div>
39
+
40
+ <div style="padding: 1rem; border-top: 1px solid #e2e8f0; background: #f8f9fa; font-size: 0.9em; color: #6c757d;">
41
+ <strong>Note:</strong> This is a demonstration. In the original Gradio app, this would use GPU processing with ZeroGPU
42
+ to generate real attention visualizations from transformer models.
43
+ </div>
44
+ </div>
45
+
46
+ <script>
+ /*
+  * Demo wiring for the Attention Mask Visualizer fragment.
+  *
+  * No model is run. After a simulated delay, the output panel shows the selected
+  * model, the first MAX_TOKENS whitespace-separated tokens of the prompt, and a
+  * tokens x tokens table filled with Math.random() values.
+  */
+ document.addEventListener("DOMContentLoaded", function () {
+   const MAX_TOKENS = 8;
+   const SIMULATED_DELAY_MS = 2000;
+   const HTML_ESCAPES = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;" };
+
+   const modelSelect = document.getElementById("model-select");
+   const promptInput = document.getElementById("prompt-input");
+   const visualizeBtn = document.getElementById("visualize-btn");
+   const output = document.getElementById("attention-output");
+
+   // Escape text that is interpolated into innerHTML so that prompt content is
+   // displayed literally instead of being parsed as markup.
+   function escapeHtml(text) {
+     return String(text).replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
+   }
+
+   // Build the result markup: model name, token list and the random-weight table.
+   function buildResultHtml(model, prompt) {
+     // The prompt is already trimmed, so splitting on whitespace runs never
+     // yields empty tokens (a plain split(" ") does for repeated spaces).
+     const tokens = prompt.split(/\s+/).slice(0, MAX_TOKENS);
+
+     let html = '<div style="margin-bottom: 1rem;"><strong>Model:</strong> ' + escapeHtml(model) + "</div>";
+     html += '<div style="margin-bottom: 1rem;"><strong>Tokens:</strong> ' + tokens.map(escapeHtml).join(" • ") + "</div>";
+     html += "<div><strong>Attention Matrix (Layer 0, Head 0):</strong></div>";
+     html += '<table style="margin-top: 0.5rem; border-collapse: collapse; font-family: monospace; font-size: 0.8em;">';
+
+     for (let row = 0; row < tokens.length; row++) {
+       html += "<tr>";
+       for (let col = 0; col < tokens.length; col++) {
+         // The random weight doubles as the alpha channel of the cell background.
+         const weight = Math.random();
+         const background = `rgba(59, 130, 246, ${weight})`;
+         html += `<td style="border: 1px solid #ddd; padding: 4px; background: ${background}; text-align: center; min-width: 40px;">${weight.toFixed(2)}</td>`;
+       }
+       html += "</tr>";
+     }
+
+     html += '</table><div style="margin-top: 1rem; font-size: 0.9em; color: #6c757d;"><em>Darker blue = higher attention weight</em></div>';
+     return html;
+   }
+
+   // Pre-fill the prompt with the same text used as the textarea placeholder.
+   promptInput.value = "You are an assistant. Make sure you print me.";
+
+   visualizeBtn.addEventListener("click", function () {
+     const model = modelSelect.value;
+     const prompt = promptInput.value.trim();
+
+     if (!prompt) {
+       output.innerHTML = '<div style="color: #e53e3e;">Please enter a prompt</div>';
+       return;
+     }
+
+     // Disable the button while the simulated "processing" delay is running.
+     visualizeBtn.disabled = true;
+     visualizeBtn.textContent = "Processing...";
+     output.innerHTML = '<div style="text-align: center; color: #6c757d;"><em>Generating attention visualization...</em></div>';
+
+     setTimeout(() => {
+       output.innerHTML = buildResultHtml(model, prompt);
+       visualizeBtn.disabled = false;
+       visualizeBtn.textContent = "\uD83D\uDE80 Visualize";
+     }, SIMULATED_DELAY_MS);
+   });
+ });
+ </script>
dist/fragments/d3-graph.html ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class=interactive-demo>
2
+ <div class=demo-header>
3
+ <h3>🔗 Model Dependency Graph</h3>
4
+ </div>
5
+ <div class=demo-content>
6
+ <iframe src=static/d3_dependency_graph.html width=100% height=600px frameborder=0 style="border-radius: 8px; background: white;"></iframe>
7
+ </div>
8
+ <div class=demo-footer>
9
+ Interactive dependency graph showing real relationships between Transformers models. 🟡 Base models (HuggingFace logo), 🔵 Derived modular models. Click and drag to explore!
10
+ </div>
11
+ </div>
12
+
dist/fragments/glm-compare.html ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div class=code-compare style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
2
+ <div class=code-column style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
3
+ <div class=code-header style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
4
+ modular_glm.py
5
+ </div>
6
+ <pre style="margin: 0; padding: 1rem; background: #ffffff; overflow-x: auto; font-size: 0.9em;"><code class=language-python>class GlmMLP(Phi3MLP):
7
+ pass
8
+
9
+ class GlmAttention(LlamaAttention):
10
+ def __init__(self, config, layer_idx=None):
11
+ super().__init__(config, layer_idx)
12
+ self.o_proj = nn.Linear(
13
+ config.num_attention_heads * self.head_dim,
14
+ config.hidden_size,
15
+ bias=False
16
+ )
17
+
18
+ class GlmForCausalLM(LlamaForCausalLM):
19
+ pass</code></pre>
20
+ </div>
21
+
22
+ <div class=code-column style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
23
+ <div class=code-header style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
24
+ modeling_glm.py (auto-expanded)
25
+ </div>
26
+ <pre style="margin: 0; padding: 1rem; background: #ffffff; overflow-x: auto; font-size: 0.9em; max-height: 400px;"><code class=language-python>class GlmMLP(nn.Module):
27
+ def __init__(self, config):
28
+ super().__init__()
29
+ self.config = config
30
+ self.gate_up_proj = nn.Linear(
31
+ config.hidden_size,
32
+ 2 * config.intermediate_size,
33
+ bias=False
34
+ )
35
+ self.down_proj = nn.Linear(
36
+ config.intermediate_size,
37
+ config.hidden_size,
38
+ bias=False
39
+ )
40
+ self.activation_fn = ACT2FN[config.hidden_act]
41
+
42
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
43
+ up_states = self.gate_up_proj(hidden_states)
44
+ gate, up_states = up_states.chunk(2, dim=-1)
45
+ up_states = up_states * self.activation_fn(gate)
46
+ return self.down_proj(up_states)
47
+
48
+
49
+ class GlmAttention(nn.Module):
50
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
51
+
52
+ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
53
+ super().__init__()
54
+ self.config = config
55
+ self.layer_idx = layer_idx
56
+ self.head_dim = getattr(
57
+ config, "head_dim",
58
+ config.hidden_size // config.num_attention_heads
59
+ )
60
+ self.num_key_value_groups = (
61
+ config.num_attention_heads // config.num_key_value_heads
62
+ )
63
+ self.scaling = self.head_dim**-0.5
64
+ self.attention_dropout = config.attention_dropout
65
+ self.is_causal = True
66
+
67
+ self.q_proj = nn.Linear(
68
+ config.hidden_size,
69
+ config.num_attention_heads * self.head_dim,
70
+ bias=config.attention_bias
71
+ )
72
+ self.k_proj = nn.Linear(
73
+ config.hidden_size,
74
+ config.num_key_value_heads * self.head_dim,
75
+ bias=config.attention_bias
76
+ )
77
+ self.v_proj = nn.Linear(
78
+ config.hidden_size,
79
+ config.num_key_value_heads * self.head_dim,
80
+ bias=config.attention_bias
81
+ )
82
+ self.o_proj = nn.Linear(
83
+ config.num_attention_heads * self.head_dim,
84
+ config.hidden_size,
85
+ bias=False
86
+ )
87
+
88
+ def forward(
89
+ self,
90
+ hidden_states: torch.Tensor,
91
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
92
+ attention_mask: Optional[torch.Tensor],
93
+ past_key_value: Optional[Cache] = None,
94
+ cache_position: Optional[torch.LongTensor] = None,
95
+ **kwargs: Unpack[FlashAttentionKwargs],
96
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
97
+ input_shape = hidden_states.shape[:-1]
98
+ hidden_shape = (*input_shape, -1, self.head_dim)
99
+
100
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
101
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
102
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
103
+
104
+ cos, sin = position_embeddings
105
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
106
+
107
+ if past_key_value is not None:
108
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
109
+ key_states, value_states = past_key_value.update(
110
+ key_states, value_states, self.layer_idx, cache_kwargs
111
+ )
112
+
113
+ attention_interface: Callable = eager_attention_forward
114
+ if self.config._attn_implementation != "eager":
115
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
116
+
117
+ attn_output, attn_weights = attention_interface(
118
+ self, query_states, key_states, value_states,
119
+ attention_mask, dropout=0.0 if not self.training else self.attention_dropout,
120
+ scaling=self.scaling, **kwargs,
121
+ )
122
+
123
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
124
+ attn_output = self.o_proj(attn_output)
125
+ return attn_output, attn_weights
126
+
127
+
128
+ @use_kernel_forward_from_hub("RMSNorm")
129
+ class GlmRMSNorm(nn.Module):
130
+ def __init__(self, hidden_size, eps=1e-6):
131
+ super().__init__()
132
+ self.weight = nn.Parameter(torch.ones(hidden_size))
133
+ self.variance_epsilon = eps
134
+
135
+ def forward(self, hidden_states):
136
+ input_dtype = hidden_states.dtype
137
+ hidden_states = hidden_states.to(torch.float32)
138
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
139
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
140
+ return self.weight * hidden_states.to(input_dtype)
141
+
142
+ # ... (many more classes and functions would follow)</code></pre>
143
+ </div>
144
+ </div>
145
+
146
+ <p style="text-align: center; font-style: italic; color: #6c757d; margin-top: 1rem;">
147
+ <strong>Left:</strong> Clean modular definition with inheritance.
148
+ <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.
149
+ </p>
dist/fragments/memory-profiler.html ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div style="border: 1px solid #e2e8f0; border-radius: 8px; background: white; margin: 1.5rem 0;">
2
+ <div style="padding: 1rem; border-bottom: 1px solid #e2e8f0; background: #f8f9fa;">
3
+ <h4 style="margin: 0 0 0.5rem 0; color: #495057;">🚀 CUDA Warmup Efficiency Benchmark</h4>
4
+ <p style="margin: 0; font-size: 0.9em; color: #6c757d;">
5
+ Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the caching_allocator_warmup function.
6
+ </p>
7
+ </div>
8
+
9
+ <div style="padding: 1rem;">
10
+ <iframe src=https://molbap-cuda-warmup-transformers.hf.space width=100% height=800px frameborder=0 style="border-radius: 8px; background: white;"></iframe>
11
+ </div>
12
+
13
+ <div style="padding: 1rem; border-top: 1px solid #e2e8f0; background: #f8f9fa; font-size: 0.9em; color: #6c757d;">
14
+ Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the <code>caching_allocator_warmup</code> function at <code>transformers/src/transformers/modeling_utils.py:6186</code>. This interactive tool loads models twice - once with warmup disabled and once with warmup enabled - to demonstrate the significant loading time improvements.
15
+ </div>
16
+ </div>
dist/fragments/terminal.html ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div style="background: #f8f9fa; border: 1px solid #e9ecef; border-radius: 8px; padding: 1rem; margin: 1.5rem 0;">
2
+ <h4 style="margin-top: 0; color: #495057;">Interactive Terminal</h4>
3
+ <div style="background: #2d3748; color: #e2e8f0; padding: 1rem; border-radius: 6px; font-family: 'Consolas', 'Monaco', monospace;">
4
+ <div style="margin-bottom: 1rem;">
5
+ <input type=text id=terminal-input placeholder="python -c 'import torch; print(torch.__version__)'" style="width: calc(100% - 80px); padding: 0.5rem; background: #1a202c; border: 1px solid #4a5568; color: #e2e8f0; border-radius: 4px;">
6
+ <button id=terminal-run style="width: 70px; padding: 0.5rem; margin-left: 8px; background: #3182ce; color: white; border: none; border-radius: 4px; cursor: pointer;">Run</button>
7
+ </div>
8
+ <pre id=terminal-output style="background: #1a202c; padding: 1rem; border-radius: 4px; min-height: 100px; margin: 0; overflow-x: auto;">$ Ready to run commands...</pre>
9
+ </div>
10
+ <p style="font-size: 0.9em; color: #6c757d; margin-top: 0.5rem;">
11
+ <em>Note: This is a simulated terminal. In the original Gradio app, this would execute real Python commands with proper security restrictions.</em>
12
+ </p>
13
+ </div>
14
+
15
+ <!--
+ Simulated terminal wiring (no command is ever executed).
+ On DOMContentLoaded the script looks up #terminal-input, #terminal-run and #terminal-output.
+ The run handler trims the input value; an empty command is ignored, otherwise the output
+ element's textContent is replaced with a canned "Simulated output" message that echoes
+ the command. The handler is bound to a click on the Run button and to the Enter key in
+ the input field (via the keypress event).
+ NOTE(review): "keypress" is marked as deprecated on MDN; consider "keydown" instead (confirm).
+ -->
+ <script>document.addEventListener("DOMContentLoaded",function(){let e=document.getElementById("terminal-input"),t=document.getElementById("terminal-run"),n=document.getElementById("terminal-output");function o(){let t=e.value.trim();t&&(n.textContent=`$ ${t}
16
+ Simulated output for: ${t}
17
+
18
+ This would execute the command in the original app.
19
+ Example outputs:
20
+ - torch version: 2.0.1+cu117
21
+ - import checks: Success
22
+ - memory info: Available`)}t.addEventListener("click",o),e.addEventListener("keypress",function(e){"Enter"===e.key&&o()})})</script>
dist/hf-logo.svg ADDED
dist/index.html ADDED
@@ -0,0 +1,615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html>
3
+ <head>
4
+ <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script>
5
+ <script src="main.bundle.js" type="module" fetchpriority="low" defer></script>
6
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/components/prism-core.min.js"></script>
7
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/plugins/autoloader/prism-autoloader.min.js"></script>
8
+ <script src="https://d3js.org/d3.v7.min.js"></script>
9
+ <meta name="viewport" content="width=device-width, initial-scale=1">
10
+ <meta charset="utf-8">
11
+ <title>Transformers Feature Showcase</title>
12
+ <link rel="stylesheet" href="style.css">
13
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/prism/1.29.0/themes/prism.min.css">
14
+ </head>
15
+ <body>
16
+ <d-front-matter>
17
+ <script id='distill-front-matter' type="text/json">{
18
+ "title": "Transformers Feature Showcase",
19
+ "description": "An interactive demonstration of transformers library features and design philosophy.",
20
+ "published": "Aug 21, 2025",
21
+ "authors": [{"author": "Pablo Montalvo", "authorURL": "https://huggingface.co/Molbap"}]
22
+ }</script>
23
+ </d-front-matter>
24
+ <d-title>
25
+ <h1>Transformers Feature Showcase</h1>
26
+ <p>An interactive demonstration of transformers library features and design philosophy.</p>
27
+ </d-title>
28
+ <d-byline></d-byline>
29
+ <d-article>
30
+ <d-contents>
31
+ <nav role="navigation" class="l-text figcaption">
32
+ <div class="toc-header"><span class="toc-title">Table of Contents</span></div>
33
+ <div class="toc-content">
34
+ <div><a href="#introduction">Introduction</a></div>
35
+ <div style="margin-left: 1.2em;"><a href="#what-you-will-learn">What you will learn</a></div>
36
+ <div><a href="#source-of-truth">0. Source of truth</a></div>
37
+ <div><a href="#one-model-one-file">1. One model, one file</a></div>
38
+ <div><a href="#code-is-product">2. Code is product</a></div>
39
+ <div><a href="#standardize-dont-abstract">3. Standardize, don't abstract</a></div>
40
+ <div><a href="#do-repeat-yourself">4. DRY* (DO Repeat Yourself)</a></div>
41
+ <div><a href="#minimal-user-api">5. Minimal user API</a></div>
42
+ <div><a href="#backwards-compatibility">6. Backwards compatibility</a></div>
43
+ <div><a href="#consistent-public-surface">7. Consistent public surface</a></div>
44
+ <div><a href="#modular">Going modular</a></div>
45
+ <div><a href="#attention-classes">External Attention classes</a></div>
46
+ <div><a href="#encoders-ftw">Encoders win!</a></div>
47
+ </div>
48
+ </nav>
49
+ </d-contents>
50
+ <h2>Introduction</h2>
51
+ <p>The <code>transformers</code> library, built with <code>PyTorch</code>, supports all state-of-the-art LLMs, many VLMs, task-specific vision language models, video models, audio models, table models, and classical encoders, for a total of almost 400 models. The name of the library itself is mostly majority-driven, as many models, like Mamba/RWKV, are not even transformer architectures. Regardless, each of these is wrought by the research and engineering team that created it, then harmonized into a now-famous interface, callable with a simple <code>.from_pretrained</code>. Inference and training are supported. The library supports ML courses and cookbooks, and several thousand other open-source libraries depend on it. All models are tested as part of a daily CI, ensuring their preservation and reproducibility. Most importantly, it is open-source and has, for a large part, been written by the community.</p>
52
+ <p>The ML wave has not stopped: more and more models are being added. <code>Transformers</code> is widely used, and we read the feedback that users post, whether it’s about a function that had 300+ keyword arguments, duplicated code and helpers, mentions of <code>Copied from ... </code> everywhere, or optimisation concerns. Text-only models are relatively tame, but multimodal models remain to be harmonized.</p>
53
+ <p>Here we will dissect the design philosophy of transformers, as a continuation of the existing, older <a href="https://huggingface.co/docs/transformers/en/philosophy">philosophy</a> page and its accompanying <a href="https://huggingface.co/blog/transformers-design-philosophy">blog post from 2022</a>. Some time ago (I dare not say how long), we discussed the state of things with the transformers maintainers. A lot of recent developments were satisfactory, but if we were only talking about these, self-congratulation would be the only goalpost. Reflecting on this philosophy now, as models pile up, is essential and will drive new developments.</p>
54
+ <h3>What you will learn</h3>
55
+ <p>Every reader, whether an OSS maintainer, power user, or casual fine-tuner, will walk away knowing how to reason about the <code>transformers</code> code base, how to use it better, and how to meaningfully contribute to it.
56
+ This will also showcase new features you might have missed so you’ll be up-to-date.</p>
57
+ <p>So, what are the principles of <code>transformers</code>? We will try to summarize the foundations on which we’ve built everything, and write the “tenets” of the library. They behave like <em>software interfaces</em>, hence it is crucial that they are explicitly written down. However opinionated they are, they have evolved over time.</p>
58
+ <div class="tenet-list">
59
+ <ol>
60
+ <li class="tenet">
61
+ <a id="source-of-truth"></a>
62
+ <strong>Source of Truth</strong>
63
+ <p>We should be a source of truth for all model definitions. This is not a tenet, but something that still guides our decisions. Model implementations should be reliable, reproducible, and faithful to the original performances.</p>
64
+ <em>This overarching guideline ensures quality and reproducibility across all models in the library.</em>
65
+ </li>
66
+ <li class="tenet">
67
+ <a id="one-model-one-file"></a>
68
+ <strong>One Model, One File</strong>
69
+ <p>All inference (and most of training, loss is separate, not a part of model) logic visible, top‑to‑bottom.</p>
70
+ <em>Every model should be completely understandable by reading a single file from top to bottom.</em>
71
+ </li>
72
+ <li class="tenet">
73
+ <a id="code-is-product"></a>
74
+ <strong>Code is Product</strong>
75
+ <p>Optimize for reading, diffing, and tweaking: our users are power users. Variables can be explicit, full words, even several words; readability is paramount.</p>
76
+ <em>Code quality matters as much as functionality - optimize for human readers, not just computers.</em>
77
+ </li>
78
+ <li class="tenet">
79
+ <a id="standardize-dont-abstract"></a>
80
+ <strong>Standardize, Don't Abstract</strong>
81
+ <p>If it's model behavior, keep it in the file; abstractions only for generic infra.</p>
82
+ <em>Model-specific logic belongs in the model file, not hidden behind abstractions.</em>
83
+ </li>
84
+ <li class="tenet">
85
+ <a id="do-repeat-yourself"></a>
86
+ <strong>DRY* (DO Repeat Yourself)</strong>
87
+ <p>Copy when it helps users; keep successors in sync without centralizing behavior.</p>
88
+ <p><strong>Amendment:</strong> With the introduction and global adoption of <a href="#modular">modular</a> transformers, we do not repeat any logic in the modular files, but end user files remain faithful to the original tenet.</p>
89
+ <em>Strategic duplication can improve readability and maintainability when done thoughtfully.</em>
90
+ </li>
91
+ <li class="tenet">
92
+ <a id="minimal-user-api"></a>
93
+ <strong>Minimal User API</strong>
94
+ <p>Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths. Reading should be obvious, configurations should be obvious.</p>
95
+ <em>Keep the public interface simple and predictable - users should know what to expect.</em>
96
+ </li>
97
+ <li class="tenet">
98
+ <a id="backwards-compatibility"></a>
99
+ <strong>Backwards Compatibility</strong>
100
+ <p>Evolve by additive standardization, <strong>never</strong> break public APIs.</p>
101
+ <p><strong>Note:</strong> Some models are showing almost no use, we also stopped adding new features for non-torch frameworks. Still, we adapt to models existing on the hub.</p>
102
+ <em>Once something is public, it stays public - evolution through addition, not breaking changes.</em>
103
+ </li>
104
+ <li class="tenet">
105
+ <a id="consistent-public-surface"></a>
106
+ <strong>Consistent Public Surface</strong>
107
+ <p>Same argument names, same outputs, hidden states and attentions exposed, enforced by tests.</p>
108
+ <em>All models should feel familiar - consistent interfaces reduce cognitive load.</em>
109
+ </li>
110
+ <li class="tenet">
111
+ <a id="modular-toolbox"></a>
112
+ <strong>Modular Toolbox (Not Framework)</strong>
113
+ <p>We ARE a toolbox. What we are not is a framework: you should not be FORCED to rewrite every modeling, but it is <em>better</em> for your model to be able to inherit from PreTrainedModel and have enabled TensorParallel, from_pretrained, sharding, push_to_hub, loss, as well as PEFT/TRL/SGLang/vLLM.</p>
114
+ <em>This is the largest change. Provide tools and utilities, but don't force users into a rigid framework.</em>
115
+ </li>
116
+ </ol>
117
+ </div>
118
+ <p>When a PR is merged, it is because the contribution is worthwhile, and that the <code>transformers</code> team finds the design of the contribution to be aligned with what is above.</p>
119
+ <p>Does all the code in the library strictly follow these tenets? No. The library is a gigantic house with connected nooks, corridors, and crannies everywhere, built by thousands of different workers. We <em>try</em> to make it so all the code added is in line with them, lest we break <a href="#backwards-compatibility">backwards compatibility</a>.</p>
120
+ <p>For instance, one function essential to the implementation of <a href="https://huggingface.co/papers/2104.09864">Rotary Positional Embeddings</a> is identical in 70 <code>modeling_&lt;file&gt;.py</code> files across <code>src/transformers/models/</code>. Why keep it? Because removing it would make those files unloadable checkpoints rather than self-contained blueprints. We <a href="#do-repeat-yourself">do repeat ourselves</a>.</p>
121
+ <pre><code class="language-python">def rotate_half(x):
122
+ &quot;&quot;&quot;Rotates half the hidden dims of the input.&quot;&quot;&quot;
123
+ x1 = x[..., : x.shape[-1] // 2]
124
+ x2 = x[..., x.shape[-1] // 2 :]
125
+ return torch.cat((-x2, x1), dim=-1)
126
+ </code></pre>
127
+ <p>You can use a script such as [[top_methods.py]] to look at all methods of a given name across your codebase and look at their differences and similarities, that’s what I did (+ a hash to avoid quadraticity).</p>
128
+ <p>So… why keep it in all modeling files? Because if we were to remove it, the model would not work anymore. Think of the modeling files as a car (I know, what a novel metaphor! But, it works out.). All manual transmission cars have a clutch, but we want each <em>view</em> of one of our cars to be able to function. Remove the clutch, you can’t drive. Remove the doors, might be uncomfortable but you’ll get there. So doors can go, but you <em>have</em> to keep the clutch, even though you know perfectly how it works.</p>
129
+ <p>As I was looking for things to improve and make better, it’s one of the iterations I attempted: a function is almost everywhere the same, so let’s import it from some common file? But no! That goes against <a href="#one-model-one-file">One model, one file</a>.</p>
130
+ <h2><a id="modular"></a> Going modular</h2>
131
+ <p>However, both of these works were already pointing at some drawbacks, which have been iteratively addressed. <a href="https://huggingface.co/docs/transformers/en/modular_transformers">Transformers has gone modular</a>, allowing a form of inheritance without breaking <a href="#one-model-one-file">One model, One file</a>. If you’re familiar with this, you can <a href="#attention-classes">skip this section</a> and go to the next one.</p>
132
+ <p>We amended the principle of <a href="#do-repeat-yourself">DRY*</a> by removing progressively all pieces of code that were “copied from” another file.</p>
133
+ <p>It is explained in detail in the documentation above, but overall it works like this: you define a <code>modular_</code> file that can inherit from <em>any function across all other modeling, configuration and processor files</em>:</p>
134
+ <summary>Auto-generated modeling code</summary>
135
+ <p><div class=code-compare style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1.5rem 0;">
136
+ <div class=code-column style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
137
+ <div class=code-header style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
138
+ modular_glm.py
139
+ </div>
140
+ <pre style="margin: 0; padding: 1rem; background: #ffffff; overflow-x: auto; font-size: 0.9em;"><code class=language-python>class GlmMLP(Phi3MLP):
141
+ pass
142
+
143
+ class GlmAttention(LlamaAttention):
144
+ def __init__(self, config, layer_idx=None):
145
+ super().__init__(config, layer_idx)
146
+ self.o_proj = nn.Linear(
147
+ config.num_attention_heads * self.head_dim,
148
+ config.hidden_size,
149
+ bias=False
150
+ )
151
+
152
+ class GlmForCausalLM(LlamaForCausalLM):
153
+ pass</code></pre>
154
+ </div>
155
+
156
+ <div class=code-column style="border: 1px solid #e2e8f0; border-radius: 8px; overflow: hidden;">
157
+ <div class=code-header style="background: #f8f9fa; padding: 0.75rem 1rem; font-weight: 600; color: #495057; border-bottom: 1px solid #e2e8f0;">
158
+ modeling_glm.py (auto-expanded)
159
+ </div>
160
+ <pre style="margin: 0; padding: 1rem; background: #ffffff; overflow-x: auto; font-size: 0.9em; max-height: 400px;"><code class=language-python>class GlmMLP(nn.Module):
161
+ def __init__(self, config):
162
+ super().__init__()
163
+ self.config = config
164
+ self.gate_up_proj = nn.Linear(
165
+ config.hidden_size,
166
+ 2 * config.intermediate_size,
167
+ bias=False
168
+ )
169
+ self.down_proj = nn.Linear(
170
+ config.intermediate_size,
171
+ config.hidden_size,
172
+ bias=False
173
+ )
174
+ self.activation_fn = ACT2FN[config.hidden_act]
175
+
176
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
177
+ up_states = self.gate_up_proj(hidden_states)
178
+ gate, up_states = up_states.chunk(2, dim=-1)
179
+ up_states = up_states * self.activation_fn(gate)
180
+ return self.down_proj(up_states)
181
+
182
+
183
+ class GlmAttention(nn.Module):
184
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
185
+
186
+ def __init__(self, config: GlmConfig, layer_idx: Optional[int] = None):
187
+ super().__init__()
188
+ self.config = config
189
+ self.layer_idx = layer_idx
190
+ self.head_dim = getattr(
191
+ config, "head_dim",
192
+ config.hidden_size // config.num_attention_heads
193
+ )
194
+ self.num_key_value_groups = (
195
+ config.num_attention_heads // config.num_key_value_heads
196
+ )
197
+ self.scaling = self.head_dim**-0.5
198
+ self.attention_dropout = config.attention_dropout
199
+ self.is_causal = True
200
+
201
+ self.q_proj = nn.Linear(
202
+ config.hidden_size,
203
+ config.num_attention_heads * self.head_dim,
204
+ bias=config.attention_bias
205
+ )
206
+ self.k_proj = nn.Linear(
207
+ config.hidden_size,
208
+ config.num_key_value_heads * self.head_dim,
209
+ bias=config.attention_bias
210
+ )
211
+ self.v_proj = nn.Linear(
212
+ config.hidden_size,
213
+ config.num_key_value_heads * self.head_dim,
214
+ bias=config.attention_bias
215
+ )
216
+ self.o_proj = nn.Linear(
217
+ config.num_attention_heads * self.head_dim,
218
+ config.hidden_size,
219
+ bias=False
220
+ )
221
+
222
+ def forward(
223
+ self,
224
+ hidden_states: torch.Tensor,
225
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
226
+ attention_mask: Optional[torch.Tensor],
227
+ past_key_value: Optional[Cache] = None,
228
+ cache_position: Optional[torch.LongTensor] = None,
229
+ **kwargs: Unpack[FlashAttentionKwargs],
230
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
231
+ input_shape = hidden_states.shape[:-1]
232
+ hidden_shape = (*input_shape, -1, self.head_dim)
233
+
234
+ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
235
+ key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
236
+ value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
237
+
238
+ cos, sin = position_embeddings
239
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
240
+
241
+ if past_key_value is not None:
242
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
243
+ key_states, value_states = past_key_value.update(
244
+ key_states, value_states, self.layer_idx, cache_kwargs
245
+ )
246
+
247
+ attention_interface: Callable = eager_attention_forward
248
+ if self.config._attn_implementation != "eager":
249
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
250
+
251
+ attn_output, attn_weights = attention_interface(
252
+ self, query_states, key_states, value_states,
253
+ attention_mask, dropout=0.0 if not self.training else self.attention_dropout,
254
+ scaling=self.scaling, **kwargs,
255
+ )
256
+
257
+ attn_output = attn_output.reshape(*input_shape, -1).contiguous()
258
+ attn_output = self.o_proj(attn_output)
259
+ return attn_output, attn_weights
260
+
261
+
262
+ @use_kernel_forward_from_hub("RMSNorm")
263
+ class GlmRMSNorm(nn.Module):
264
+ def __init__(self, hidden_size, eps=1e-6):
265
+ super().__init__()
266
+ self.weight = nn.Parameter(torch.ones(hidden_size))
267
+ self.variance_epsilon = eps
268
+
269
+ def forward(self, hidden_states):
270
+ input_dtype = hidden_states.dtype
271
+ hidden_states = hidden_states.to(torch.float32)
272
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
273
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
274
+ return self.weight * hidden_states.to(input_dtype)
275
+
276
+ # ... (many more classes and functions would follow)</code></pre>
277
+ </div>
278
+ </div>
279
+
280
+ <p style="text-align: center; font-style: italic; color: #6c757d; margin-top: 1rem;">
281
+ <strong>Left:</strong> Clean modular definition with inheritance.
282
+ <strong>Right:</strong> Auto-expanded version with all inherited functionality visible.
283
+ </p></p>
284
+ <h2><a id="attention-classes"></a> External Attention classes</h2>
285
+ <p>A chronological iteration over <a href="#modular">modular</a>, and a big improvement in terms of readability, was to remove the various attention-backend-specific attention classes across the repository. Before, we were adding specific torch operations for each backend (sdpa, flash-attention iterations, flex attention), but it wasn’t a <a href="#minimal-user-api">minimal user api</a>.</p>
286
+ <p>What will forever stay in the modeling code is the <code>eager_attention_forward</code> because it is a core part of the modeling,</p>
287
+ <pre><code class="language-python">attention_interface: Callable = eager_attention_forward
288
+ if self.config._attn_implementation != &quot;eager&quot;:
289
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
290
+ </code></pre>
291
+ <p>We often read and understand that <code>kwargs</code> are criticized, and we are typing them however we can, but we cannot enforce them all the time because other libraries such as vLLM don’t use the same kwargs.</p>
292
+ <p>It is a strength of the new attention interface, where it can be plugged in various backends, because most of the signature is not enforced. We INFORM but do not ENFORCE. That way, the current system is a <a href="#minimal-user-api">minimal user api</a>.</p>
293
+ <p>For better <em>information</em>, we plan to use <code>python</code> features such as <code>Annotated</code>, for example, to inform users of what we typically expect in an argument. That way, higher-level information could be included directly in the type annotations, telling for instance the expected dimensions and contents of a tensor.</p>
294
+ <h2><a id="simpler-tensor-parallelism"></a> Simpler Tensor Parallelism</h2>
295
+ <p>We want to touch the modeling code as little as possible, and only modify it when <em>architectural changes</em> are involved. For instance, for tensor parallelism, we instead now specify a simple <code>tp_plan</code>.</p>
296
+ <h2><a id="layers-attentions-caches"></a> Layers, attentions and caches</h2>
297
+ <p>With th</p>
298
+ <h2><a id="community-kernels"></a>Community Kernels</h2>
299
+ <p>The same principle extends to normalization, activation, and other hot paths. The model defines <strong>semantics</strong>; a kernel defines <strong>how</strong> to execute them faster. We annotate the module to borrow a community‑provided forward, keeping a <a href="#consistent-public-surface">consistent public surface</a></p>
300
+ <pre><code class="language-python">@use_kernel_forward_from_hub(&quot;RMSNorm&quot;)
301
+ class GlmRMSNorm(nn.Module):
302
+ ...
303
+ </code></pre>
304
+ <p>Plus, this opened another angle of contribution for the community. People who are GPU whisperers can check on the <a href="https://huggingface.co/blog/hello-hf-kernels">kernel community blog post</a> to learn more about it!</p>
305
+ <h2>The good modularity</h2>
306
+ <p>Now, we have a form of inheritance in our codebase. Some models become standards, and model contributors are given the opportunity to <em>define standards</em>. Pushing the boundaries of scientific knowledge can translate into the boundaries of engineering if this effort is made, and we’re striving for it.</p>
307
+ <p>My capacity for abstraction is not that great, compared to other computer scientists and engineers: I need to look at little doodles and drawings, especially when components pile up.</p>
308
+ <p>So I wanted to take a look at the current <strong>state of modularity</strong> across the repository. How many models are defined using components of others?</p>
309
+ <p>To get this graph, I used the heuristic of modular inheritance.</p>
310
+ <ol>
311
+ <li>Does this model have a <code>modular</code> file?</li>
312
+ <li>In this <code>modular</code> file, what models, configurations and processings are imported?</li>
313
+ <li>Recurse through the model list that way.</li>
314
+ </ol>
315
+ <p>So what do we see? Llama is a basis for many models, and it shows.
316
+ Radically different architectures such as mamba have spawned their own dependency subgraph.
317
+ <div class=interactive-demo>
318
+ <div class=demo-header>
319
+ <h3>🔗 Model Dependency Graph</h3>
320
+ </div>
321
+ <div class=demo-content>
322
+ <iframe src=static/d3_dependency_graph.html width=100% height=600px frameborder=0 style="border-radius: 8px; background: white;"></iframe>
323
+ </div>
324
+ <div class=demo-footer>
325
+ Interactive dependency graph showing real relationships between Transformers models. 🟡 Base models (HuggingFace logo), 🔵 Derived modular models. Click and drag to explore!
326
+ </div>
327
+ </div>
328
+
329
+ </p>
330
+ <p><img src="static/graph_modular_related_models.png" alt="Graph showing modular related models"></p>
331
+ <p>But there is no similar miracle for VLMs across the board.
332
+ As you can see, there is a small DETR island, a little llava pocket, and so on, but it’s not comparable to the centrality observed.</p>
333
+ <p>One problem is, this is only for <code>modular</code> models. Several models do NOT have a modular file. In other words, we have a big “hidden space” here.</p>
334
+ <h2>Too many models, yet not enough, are alike</h2>
335
+ <p>So I looked into Jaccard similarity, which we use to measure set differences. I know that code is more than a set of characters strung together, but it is a decent proxy for now. You can check out [[find_dependencies.py]].</p>
336
+ <p><div style="background: #f8f9fa; border: 1px solid #e9ecef; border-radius: 8px; padding: 1rem; margin: 1.5rem 0;">
337
+ <h4 style="margin-top: 0; color: #495057;">Interactive Terminal</h4>
338
+ <div style="background: #2d3748; color: #e2e8f0; padding: 1rem; border-radius: 6px; font-family: 'Consolas', 'Monaco', monospace;">
339
+ <div style="margin-bottom: 1rem;">
340
+ <input type=text id=terminal-input placeholder="python -c 'import torch; print(torch.__version__)'" style="width: calc(100% - 80px); padding: 0.5rem; background: #1a202c; border: 1px solid #4a5568; color: #e2e8f0; border-radius: 4px;">
341
+ <button id=terminal-run style="width: 70px; padding: 0.5rem; margin-left: 8px; background: #3182ce; color: white; border: none; border-radius: 4px; cursor: pointer;">Run</button>
342
+ </div>
343
+ <pre id=terminal-output style="background: #1a202c; padding: 1rem; border-radius: 4px; min-height: 100px; margin: 0; overflow-x: auto;">$ Ready to run commands...</pre>
344
+ </div>
345
+ <p style="font-size: 0.9em; color: #6c757d; margin-top: 0.5rem;">
346
+ <em>Note: This is a simulated terminal. In the original Gradio app, this would execute real Python commands with proper security restrictions.</em>
347
+ </p>
348
+ </div>
349
+
350
+ <script>document.addEventListener("DOMContentLoaded",function(){let e=document.getElementById("terminal-input"),t=document.getElementById("terminal-run"),n=document.getElementById("terminal-output");function o(){let t=e.value.trim();t&&(n.textContent=`$ ${t}
351
+ Simulated output for: ${t}
352
+
353
+ This would execute the command in the original app.
354
+ Example outputs:
355
+ - torch version: 2.0.1+cu117
356
+ - import checks: Success
357
+ - memory info: Available`)}t.addEventListener("click",o),e.addEventListener("keypress",function(e){"Enter"===e.key&&o()})})</script></p>
358
+ <p><img src="static/Jaccard_similarity_plot.png" alt="Jaccard similarity plot showing model relationships"></p>
359
+ <p>The yellow areas are places where models are very different to each other. We can see islands here and there corresponding to model families. Llava goes with Llava-onevision, LlavaNext, LlavaNext-video, etc.</p>
360
+ <h2>VLM improvements, avoiding abstraction</h2>
361
+ <p>We don’t have a cookbook for common VLM patterns (image token scatter, multi‑tower encoders, cross‑attn bridges). This is one of the main improvement points where we can work.</p>
362
+ <p>So initially I thought of abstracting away the mixing of <code>inputs_embeds</code>, the tensor fed into an llm decoder in 95% of the existing VLMs. It would have looked something like</p>
363
+ <pre><code class="language-python">class InputsEmbeddingMixerMixin(nn.Module):
364
+ #
365
+ </code></pre>
366
+ <p>But this is breaking <a href="#standardize-dont-abstract">Standardize, don’t abstract</a>. Embedding mixin is part of the model, removing it would break it. A user opening <code>modeling_qwen2.5_vl</code> should not have to go to another file.</p>
367
+ <p>This is the current state of abstractions across a modeling file:</p>
368
+ <p><img src="static/Bloatedness_visualizer.png" alt="Bloatedness visualizer showing abstraction levels"></p>
369
+ <p>The following <a href="https://github.com/huggingface/transformers/pull/39777">Pull request to standardize placeholder masking</a> is a good example of what kind of changes are acceptable. In a VLM, we always need to insert embeddings from various encoders at various positions, so we can have a function to do it. For Qwen2 VL, for instance, it will look like this:</p>
370
+ <pre><code class="language-python"> def get_placeholder_mask(
371
+ self,
372
+ input_ids: torch.LongTensor,
373
+ inputs_embeds: torch.FloatTensor,
374
+ image_features: torch.FloatTensor = None,
375
+ video_features: torch.FloatTensor = None,
376
+ ):
377
+ &quot;&quot;&quot;
378
+ Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
379
+ equal to the length of multimodal features. If the lengths are different, an error is raised.
380
+ &quot;&quot;&quot;
381
+ if input_ids is None:
382
+ special_image_mask = inputs_embeds == self.get_input_embeddings()(
383
+ torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
384
+ )
385
+ special_image_mask = special_image_mask.all(-1)
386
+ special_video_mask = inputs_embeds == self.get_input_embeddings()(
387
+ torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
388
+ )
389
+ special_video_mask = special_video_mask.all(-1)
390
+ else:
391
+ special_image_mask = input_ids == self.config.image_token_id
392
+ special_video_mask = input_ids == self.config.video_token_id
393
+
394
+ n_image_tokens = special_image_mask.sum()
395
+ special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
396
+ if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
397
+ raise ValueError(
398
+ f&quot;Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}&quot;
399
+ )
400
+
401
+ n_video_tokens = special_video_mask.sum()
402
+ special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
403
+ if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
404
+ raise ValueError(
405
+ f&quot;Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}&quot;
406
+ )
407
+
408
+ return special_image_mask, special_video_mask
409
+ </code></pre>
410
+ <p>But this is <em>within</em> the modeling file, not in the <code>PreTrainedModel</code> base class. It will not move away from it, because it’d break the self-contained logic of the model.</p>
411
+ <h2>Modularity candidates</h2>
412
+ <p>So the question arises naturally: How can we modularize more?
413
+ Once again, I took a similarity measure and looked at the existing graphs. The tool is available on this <a href="https://huggingface.co/spaces/Molbap/transformers-modular-refactor">ZeroGPU-enabled Space</a>. It scans the whole transformers repository, and outputs a graph of candidates across models, using either a Jaccard similarity index (simple) or a SentenceTransformers embedding model. It is understandable that <a href="#encoders-ftw">encoder models still have a lion’s share of the game.</a> See also <a href="https://huggingface.co/blog/train-sparse-encoder">Tom Aarsen and Arthur Bresnu’s great blog post on the topic of sparse embeddings</a>.</p>
414
+ <p><img src="static/modular_candidates.png" alt="Modular candidates analysis"></p>
415
+ <h2><a id="encoders-ftw"></a> The neverending stories of encoder models.</h2>
416
+ <p>Models popularity speaks for itself! This is because the usage of encoders lies in embeddings obviously. So we have to keep the encoders part viable, usable, fine-tune-able.</p>
417
+ <p><img src="static/popular_models_barplot.png" alt="Popular models bar plot"></p>
418
+ <h2>On image processing and processors</h2>
419
+ <p>Choosing to be a <code>torch</code>-first library meant shedding a tremendous amount of support burden for <code>jax</code> and <code>TensorFlow</code>, and it also meant that we could be more lenient about the amount of torch-dependent utilities that we were able to add. One of these is the <em>fast processing</em> of images. Where they were previously assumed to be minimal ndarrays, making stronger assumptions and enforcing <code>torch</code>- and <code>torchvision</code>-native inputs allowed us to massively speed up the processing time for each model.</p>
420
+ <p>The gains in performance are immense: up to a 20x speedup for most models when using compiled torchvision ops.</p>
421
+ <h2>Reduce barrier to entry/contribution</h2>
422
+ <p>This is an overarching objective: there is no transformers without its community.</p>
423
+ <p>We didn’t want to make a toolbox, old tenet, because <em>having a framework means forcing users into it</em>. It restrains flexibility and creativity, which are the fertile soil for new ideas to grow.
424
+ Among the most valuable contributions to <code>transformers</code> is, of course, the addition of new models.</p>
425
+ <h2>A surgical toolbox for model development</h2>
426
+ <h3>Attention visualisation</h3>
427
+ <p>If all models have the same API internally for attention computation, it allows us to build cool tools to visualize the inner workings of the attention mechanism. One particular piece of
428
+ machinery is the <code>attention mask</code>, cause of confusion. Thankfully, we can fix it.</p>
429
+ <p><div style="border: 1px solid #e2e8f0; border-radius: 8px; background: white; margin: 1.5rem 0;">
430
+ <div style="padding: 1rem; border-bottom: 1px solid #e2e8f0; background: #f8f9fa;">
431
+ <h4 style="margin: 0 0 0.5rem 0; color: #495057;">🔍 Attention Mask Visualizer</h4>
432
+ <p style="margin: 0; font-size: 0.9em; color: #6c757d;">
433
+ Visualize attention patterns in transformer models. This helps debug attention mask issues.
434
+ </p>
435
+ </div>
436
+
437
+ <div style="padding: 1rem;">
438
+ <div style="display: grid; grid-template-columns: 1fr auto; gap: 1rem; align-items: start; margin-bottom: 1rem;">
439
+ <div>
440
+ <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Model:</label>
441
+ <select id=model-select style="width: 100%; padding: 0.5rem; border: 1px solid #d1d5db; border-radius: 6px; background: white;">
442
+ <option value=openai-community/gpt2>openai-community/gpt2</option>
443
+ <option value=google/gemma-2-2b>google/gemma-2-2b</option>
444
+ <option value=microsoft/DialoGPT-small>microsoft/DialoGPT-small</option>
445
+ </select>
446
+ </div>
447
+
448
+ <div>
449
+ <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Action:</label>
450
+ <button id=visualize-btn style="padding: 0.5rem 1rem; background: #3b82f6; color: white; border: none; border-radius: 6px; cursor: pointer; font-weight: 500;">
451
+ 🚀 Visualize
452
+ </button>
453
+ </div>
454
+ </div>
455
+
456
+ <div style="margin-bottom: 1rem;">
457
+ <label style="display: block; font-weight: 600; margin-bottom: 0.5rem; color: #374151;">Prompt:</label>
458
+ <textarea id=prompt-input style="width: 100%; padding: 0.75rem; border: 1px solid #d1d5db; border-radius: 6px; resize: vertical; font-family: monospace; font-size: 0.9em;" rows=3 placeholder="You are an assistant. Make sure you print me."></textarea>
459
+ </div>
460
+
461
+ <div id=attention-output style="min-height: 200px; background: #f8f9fa; border: 1px solid #e9ecef; border-radius: 6px; padding: 1rem;">
462
+ <div style="text-align: center; color: #6c757d; font-style: italic;">
463
+ Click "Visualize" to generate attention visualization
464
+ </div>
465
+ </div>
466
+ </div>
467
+
468
+ <div style="padding: 1rem; border-top: 1px solid #e2e8f0; background: #f8f9fa; font-size: 0.9em; color: #6c757d;">
469
+ <strong>Note:</strong> This is a demonstration. In the original Gradio app, this would use GPU processing with ZeroGPU
470
+ to generate real attention visualizations from transformer models.
471
+ </div>
472
+ </div>
473
+
474
+ <script>document.addEventListener("DOMContentLoaded",function(){let e=document.getElementById("model-select"),t=document.getElementById("prompt-input"),n=document.getElementById("visualize-btn"),i=document.getElementById("attention-output");t.value="You are an assistant. Make sure you print me.",n.addEventListener("click",function(){let o=e.value,r=t.value.trim();if(!r){i.innerHTML='<div style="color: #e53e3e;">Please enter a prompt</div>';return}n.disabled=!0,n.textContent="Processing...",i.innerHTML='<div style="text-align: center; color: #6c757d;"><em>Generating attention visualization...</em></div>',setTimeout(()=>{let e=r.split(" ").slice(0,8),t='<div style="margin-bottom: 1rem;"><strong>Model:</strong> '+o+"</div>";t+='<div style="margin-bottom: 1rem;"><strong>Tokens:</strong> '+e.join(" • ")+'</div><div><strong>Attention Matrix (Layer 0, Head 0):</strong></div><table style="margin-top: 0.5rem; border-collapse: collapse; font-family: monospace; font-size: 0.8em;">';for(let n=0;n<e.length;n++){t+="<tr>";for(let n=0;n<e.length;n++){let e=Math.random(),n=`rgba(59, 130, 246, ${e})`;t+=`<td style="border: 1px solid #ddd; padding: 4px; background: ${n}; text-align: center; min-width: 40px;">${e.toFixed(2)}</td>`}t+="</tr>"}i.innerHTML=t+='</table><div style="margin-top: 1rem; font-size: 0.9em; color: #6c757d;"><em>Darker blue = higher attention weight</em></div>',n.disabled=!1,n.textContent="\uD83D\uDE80 Visualize"},2e3)})})</script></p>
475
+ <p>Because it is all PyTorch (and it is even more now that we support only PyTorch), we can easily debug any model when we want to add it to transformers. We now have a power-user tool for porting or adding models, that wraps a forward pass, intercepts every submodule call, and logs shapes, dtypes, and sample statistics of inputs/outputs to nested JSON.</p>
476
+ <p>It just works with PyTorch models and is especially useful when aligning outputs with a reference implementation, aligned with our core guideline, <a href="#source-of-truth">source of truth for model definitions</a>.</p>
477
+ <p><img src="static/model_debugger.png" alt="Model debugger interface"></p>
478
+ <h3>Transformers-serve</h3>
479
+ <p>Having all these models readily available allows us to use all of them with transformers-serve, and enables interfacing with them through an Open API-like pattern.</p>
480
+ <p>#### add example</p>
481
+ <h2>Community reusability</h2>
482
+ <p>Adding a model to transformers means:</p>
483
+ <ul>
484
+ <li>having it immediately available to the community</li>
485
+ <li>usable in vLLM, SGLang, and so on without additional code.</li>
486
+ </ul>
487
+ <h2>Inner cooking: CUDA Warmup</h2>
488
+ <p>Having a clean <em>external</em> API allows us to work on the true inner workings of transformers. One of the few recent additions was the <em>CUDA warmup</em> via <code>caching_allocator_warmup</code>, which massively improved the loading time by pre-allocating GPU memory to avoid malloc bottlenecks during model loading.</p>
489
+ <p><div style="border: 1px solid #e2e8f0; border-radius: 8px; background: white; margin: 1.5rem 0;">
490
+ <div style="padding: 1rem; border-bottom: 1px solid #e2e8f0; background: #f8f9fa;">
491
+ <h4 style="margin: 0 0 0.5rem 0; color: #495057;">🚀 CUDA Warmup Efficiency Benchmark</h4>
492
+ <p style="margin: 0; font-size: 0.9em; color: #6c757d;">
493
+ Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the caching_allocator_warmup function.
494
+ </p>
495
+ </div>
496
+
497
+ <div style="padding: 1rem;">
498
+ <iframe src=https://molbap-cuda-warmup-transformers.hf.space width=100% height=800px frameborder=0 style="border-radius: 8px; background: white;"></iframe>
499
+ </div>
500
+
501
+ <div style="padding: 1rem; border-top: 1px solid #e2e8f0; background: #f8f9fa; font-size: 0.9em; color: #6c757d;">
502
+ Real CUDA warmup benchmarking with actual Transformers models. Measure the performance impact of the <code>caching_allocator_warmup</code> function at <code>transformers/src/transformers/modeling_utils.py:6186</code>. This interactive tool loads models twice - once with warmup disabled and once with warmup enabled - to demonstrate the significant loading time improvements.
503
+ </div>
504
+ </div></p>
505
+ <h3>Linkedin post (to remove)</h3>
506
+ <p>Linkedin post for videos:</p>
507
+ <p>In transformers, how do we deal with cross-model dependencies, while supporting ~400 models? Maybe you’ve seen the same 200-lines functions in too many <em>modeling_file.py</em>? Duplication isn’t inevitable.</p>
508
+ <p>The “one‑model/one‑file” rule keeps every model readable and runnable. It also means identical code is copied hundreds of times. Maintenance hurts, contributor PRs snowball, and vision–language models especially end up in siloed forks.</p>
509
+ <p>modular_*.py fixes the trade‑off, by auto-generating the modeling file from a modular file, which can use inheritance.</p>
510
+ <p>With a small analyser I’ve mapped which models already share modular pieces and which 100‑plus still repeat themselves. Red nodes in the graph = lowest‑hanging fruit for refactor; blue = already modular.</p>
511
+ <p>The result: contributors can focus on novel layers instead of boilerplate, reviews shrink from “new file diff” to “does this override make sense?”, and the codebase stays something you can actually open and read.</p>
512
+ <p>If you maintain or ship models on top of Transformers, take a look at modular, in 2025 it’s how we keep shipping breadth without the bloat. 🛠️</p>
513
+
514
+ </d-article>
515
+
516
+ <script>
517
+ function initializeTOC() {
518
+ const article = document.querySelector('d-article');
519
+ const toc = document.querySelector('d-contents');
520
+ if (toc) {
521
+ const headings = article.querySelectorAll('h1, h2, h3, h4');
522
+ let ToC = '<nav role="navigation" class="l-text figcaption">';
523
+ ToC += '<div class="toc-header"><span class="toc-title">Table of Contents</span></div>';
524
+ ToC += '<div class="toc-content">';
525
+
526
+ headings.forEach((heading, index) => {
527
+ const id = heading.id || 'heading-' + index;
528
+ if (!heading.id) heading.id = id;
529
+ const level = parseInt(heading.tagName.charAt(1));
530
+ const indent = level === 1 ? '' : 'style="margin-left: ' + ((level - 1) * 1.2) + 'em;"';
531
+ ToC += '<div ' + indent + '><a href="#' + id + '">' + heading.textContent + '</a></div>';
532
+ });
533
+
534
+ ToC += '</div></nav>';
535
+ toc.innerHTML = ToC;
536
+ toc.setAttribute('prerendered', 'true');
537
+
538
+ // Extract tenet text for tooltips
539
+ const tenetTooltips = {
540
+ 'source-of-truth': 'We should be a source of truth for all model definitions. Model implementations should be reliable, reproducible, and faithful to the original performances.',
541
+ 'one-model-one-file': 'All inference (and most of training, loss is separate, not a part of model) logic visible, top‑to‑bottom.',
542
+ 'code-is-product': 'Optimize for reading, diffing, and tweaking, our users are power users. Variables can be explicit, full words, even several words, readability is primordial.',
543
+ 'standardize-dont-abstract': 'If it\'s model behavior, keep it in the file; abstractions only for generic infra.',
544
+ 'do-repeat-yourself': 'Copy when it helps users; keep successors in sync without centralizing behavior.',
545
+ 'minimal-user-api': 'Config, model, preprocessing; from_pretrained, save_pretrained, push_to_hub. We want the least amount of codepaths.',
546
+ 'backwards-compatibility': 'Evolve by additive standardization, never break public APIs.',
547
+ 'consistent-public-surface': 'Same argument names, same outputs, hidden states and attentions exposed.',
548
+ };
549
+
550
+ // Add smooth scrolling and active state
551
+ const tocLinks = document.querySelectorAll('d-contents a');
552
+ tocLinks.forEach(link => {
553
+ const href = link.getAttribute('href');
554
+ const anchor = href ? href.substring(1) : '';
555
+
556
+ // Add tooltip if this is a tenet link
557
+ if (tenetTooltips[anchor]) {
558
+ link.setAttribute('title', tenetTooltips[anchor]);
559
+ link.style.position = 'relative';
560
+ }
561
+
562
+ link.addEventListener('click', function(e) {
563
+ e.preventDefault();
564
+ const target = document.querySelector(this.getAttribute('href'));
565
+ if (target) {
566
+ target.scrollIntoView({ behavior: 'smooth' });
567
+ }
568
+ });
569
+ });
570
+
571
+ // Update active state on scroll
572
+ window.addEventListener('scroll', function() {
573
+ const scrollPos = window.scrollY + 100;
574
+ headings.forEach((heading) => {
575
+ const link = document.querySelector('d-contents a[href="#' + heading.id + '"]');
576
+ if (link) {
577
+ if (heading.offsetTop <= scrollPos &&
578
+ heading.offsetTop + heading.offsetHeight > scrollPos) {
579
+ link.classList.add('active');
580
+ } else {
581
+ link.classList.remove('active');
582
+ }
583
+ }
584
+ });
585
+ });
586
+ }
587
+ }
588
+
589
+ // Initialize Prism syntax highlighting
590
+ function initializeSyntaxHighlighting() {
591
+ if (typeof Prism !== 'undefined') {
592
+ Prism.highlightAll();
593
+ }
594
+ }
595
+
596
+ // Try multiple times to ensure it runs after distill.js
597
+ document.addEventListener('DOMContentLoaded', function() {
598
+ initializeTOC();
599
+ initializeSyntaxHighlighting();
600
+ });
601
+ setTimeout(function() {
602
+ initializeTOC();
603
+ initializeSyntaxHighlighting();
604
+ }, 100);
605
+ setTimeout(function() {
606
+ initializeTOC();
607
+ initializeSyntaxHighlighting();
608
+ }, 500);
609
+ setTimeout(function() {
610
+ initializeTOC();
611
+ initializeSyntaxHighlighting();
612
+ }, 1000);
613
+ </script>
614
+ </body>
615
+ </html>
dist/main.bundle.js ADDED
The diff for this file is too large to render. See raw diff
 
dist/main.bundle.js.map ADDED
The diff for this file is too large to render. See raw diff
 
dist/static/Bloatedness_visualizer.png ADDED

Git LFS Details

  • SHA256: 6e30ca37e88572e00b06651728b5667464837c69c18b22c97a04127367d8a500
  • Pointer size: 131 Bytes
  • Size of remote file: 118 kB
dist/static/Jaccard_similarity_plot.png ADDED

Git LFS Details

  • SHA256: 486fad0f93c66d7ccc9fb35bad4ef75a8e49fbe6d48e5adbab6eda6e9367b653
  • Pointer size: 130 Bytes
  • Size of remote file: 65.9 kB
dist/static/d3_dependency_graph.html ADDED
@@ -0,0 +1,1902 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ <!DOCTYPE html>
3
+ <html lang="en">
4
+ <head>
5
+ <meta charset="UTF-8">
6
+ <title>Transformers Modular Model Dependencies</title>
7
+ <style>
8
+ /* Google‑font – small fallback cost & optional */
9
+ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;600&display=swap');
10
+
11
+ :root {
12
+ --base‑size: 60px; /* icon radius helper */
13
+ }
14
+
15
+ body {
16
+ font-family: 'Inter', Arial, sans-serif;
17
+ margin: 0;
18
+ overflow: hidden;
19
+ background-color: transparent; /* requested transparency */
20
+ }
21
+
22
+ svg {
23
+ width: 100vw;
24
+ height: 100vh;
25
+ }
26
+
27
+ .link {
28
+ stroke: #999;
29
+ stroke-opacity: 0.6;
30
+ }
31
+
32
+ .node-label {
33
+ fill: #333;
34
+ pointer-events: none;
35
+ text-anchor: middle;
36
+ font-weight: 600;
37
+ }
38
+
39
+ .link-label {
40
+ fill: #555;
41
+ font-size: 10px;
42
+ pointer-events: none;
43
+ text-anchor: middle;
44
+ }
45
+
46
+ .node.base path { fill: #ffbe0b; }
47
+ .node.derived circle { fill: #1f77b4; }
48
+
49
+ /* Legend styling */
50
+ #legend {
51
+ position: fixed;
52
+ top: 18px;
53
+ left: 18px;
54
+ font-size: 20px;
55
+ background: rgba(255,255,255,0.92);
56
+ padding: 18px 28px;
57
+ border-radius: 10px;
58
+ border: 1.5px solid #bbb;
59
+ font-family: 'Inter', Arial, sans-serif;
60
+ box-shadow: 0 2px 8px rgba(0,0,0,0.08);
61
+ z-index: 1000;
62
+ }
63
+ </style>
64
+ </head>
65
+ <body>
66
+ <div id="legend">🟡 base model (HF icon)<br>🔵 derived modular model<br>Edge label: #classes imported</div>
67
+ <svg id="dependency-graph"></svg>
68
+ <script src="https://d3js.org/d3.v7.min.js"></script>
69
+ <script>
70
+ const graphData = {
71
+ "nodes": [
72
+ {
73
+ "id": "aimv2",
74
+ "is_base": false,
75
+ "size": 1.146341463414634
76
+ },
77
+ {
78
+ "id": "arcee",
79
+ "is_base": false,
80
+ "size": 1.0975609756097562
81
+ },
82
+ {
83
+ "id": "aria",
84
+ "is_base": false,
85
+ "size": 1.146341463414634
86
+ },
87
+ {
88
+ "id": "auto",
89
+ "is_base": true,
90
+ "size": 1.0975609756097562
91
+ },
92
+ {
93
+ "id": "aya_vision",
94
+ "is_base": false,
95
+ "size": 1.048780487804878
96
+ },
97
+ {
98
+ "id": "bamba",
99
+ "is_base": false,
100
+ "size": 1.2439024390243902
101
+ },
102
+ {
103
+ "id": "bart",
104
+ "is_base": true,
105
+ "size": 1.146341463414634
106
+ },
107
+ {
108
+ "id": "beit",
109
+ "is_base": true,
110
+ "size": 1.048780487804878
111
+ },
112
+ {
113
+ "id": "bigbird_pegasus",
114
+ "is_base": true,
115
+ "size": 1.048780487804878
116
+ },
117
+ {
118
+ "id": "biogpt",
119
+ "is_base": false,
120
+ "size": 1.0975609756097562
121
+ },
122
+ {
123
+ "id": "bitnet",
124
+ "is_base": false,
125
+ "size": 1.0975609756097562
126
+ },
127
+ {
128
+ "id": "blip",
129
+ "is_base": true,
130
+ "size": 1.048780487804878
131
+ },
132
+ {
133
+ "id": "blip_2",
134
+ "is_base": true,
135
+ "size": 1.048780487804878
136
+ },
137
+ {
138
+ "id": "chameleon",
139
+ "is_base": true,
140
+ "size": 1.0975609756097562
141
+ },
142
+ {
143
+ "id": "clip",
144
+ "is_base": true,
145
+ "size": 1.2439024390243902
146
+ },
147
+ {
148
+ "id": "cohere",
149
+ "is_base": false,
150
+ "size": 1.1951219512195121
151
+ },
152
+ {
153
+ "id": "cohere2",
154
+ "is_base": false,
155
+ "size": 1.0975609756097562
156
+ },
157
+ {
158
+ "id": "colpali",
159
+ "is_base": false,
160
+ "size": 1.0975609756097562
161
+ },
162
+ {
163
+ "id": "colqwen2",
164
+ "is_base": false,
165
+ "size": 1.048780487804878
166
+ },
167
+ {
168
+ "id": "conditional_detr",
169
+ "is_base": false,
170
+ "size": 1.048780487804878
171
+ },
172
+ {
173
+ "id": "csm",
174
+ "is_base": false,
175
+ "size": 1.048780487804878
176
+ },
177
+ {
178
+ "id": "d_fine",
179
+ "is_base": false,
180
+ "size": 1.0975609756097562
181
+ },
182
+ {
183
+ "id": "data2vec",
184
+ "is_base": false,
185
+ "size": 1.048780487804878
186
+ },
187
+ {
188
+ "id": "deepseek_v2",
189
+ "is_base": false,
190
+ "size": 1.0975609756097562
191
+ },
192
+ {
193
+ "id": "deepseek_v3",
194
+ "is_base": false,
195
+ "size": 1.146341463414634
196
+ },
197
+ {
198
+ "id": "deepseek_vl",
199
+ "is_base": false,
200
+ "size": 1.146341463414634
201
+ },
202
+ {
203
+ "id": "deepseek_vl_hybrid",
204
+ "is_base": false,
205
+ "size": 1.146341463414634
206
+ },
207
+ {
208
+ "id": "deformable_detr",
209
+ "is_base": false,
210
+ "size": 1.048780487804878
211
+ },
212
+ {
213
+ "id": "depth_anything",
214
+ "is_base": true,
215
+ "size": 1.048780487804878
216
+ },
217
+ {
218
+ "id": "detr",
219
+ "is_base": true,
220
+ "size": 1.2439024390243902
221
+ },
222
+ {
223
+ "id": "dia",
224
+ "is_base": false,
225
+ "size": 1.0975609756097562
226
+ },
227
+ {
228
+ "id": "diffllama",
229
+ "is_base": false,
230
+ "size": 1.146341463414634
231
+ },
232
+ {
233
+ "id": "dinov2",
234
+ "is_base": true,
235
+ "size": 1.0975609756097562
236
+ },
237
+ {
238
+ "id": "dinov2_with_registers",
239
+ "is_base": false,
240
+ "size": 1.048780487804878
241
+ },
242
+ {
243
+ "id": "doge",
244
+ "is_base": false,
245
+ "size": 1.0975609756097562
246
+ },
247
+ {
248
+ "id": "dots1",
249
+ "is_base": false,
250
+ "size": 1.0975609756097562
251
+ },
252
+ {
253
+ "id": "dpt",
254
+ "is_base": false,
255
+ "size": 1.0975609756097562
256
+ },
257
+ {
258
+ "id": "emu3",
259
+ "is_base": false,
260
+ "size": 1.146341463414634
261
+ },
262
+ {
263
+ "id": "eomt",
264
+ "is_base": false,
265
+ "size": 1.1951219512195121
266
+ },
267
+ {
268
+ "id": "ernie4_5",
269
+ "is_base": false,
270
+ "size": 1.146341463414634
271
+ },
272
+ {
273
+ "id": "ernie4_5_moe",
274
+ "is_base": false,
275
+ "size": 1.1951219512195121
276
+ },
277
+ {
278
+ "id": "esm",
279
+ "is_base": true,
280
+ "size": 1.048780487804878
281
+ },
282
+ {
283
+ "id": "evolla",
284
+ "is_base": false,
285
+ "size": 1.0975609756097562
286
+ },
287
+ {
288
+ "id": "exaone4",
289
+ "is_base": false,
290
+ "size": 1.0975609756097562
291
+ },
292
+ {
293
+ "id": "falcon_h1",
294
+ "is_base": false,
295
+ "size": 1.146341463414634
296
+ },
297
+ {
298
+ "id": "falcon_mamba",
299
+ "is_base": false,
300
+ "size": 1.048780487804878
301
+ },
302
+ {
303
+ "id": "gemma",
304
+ "is_base": false,
305
+ "size": 1.3414634146341464
306
+ },
307
+ {
308
+ "id": "gemma2",
309
+ "is_base": false,
310
+ "size": 1.2439024390243902
311
+ },
312
+ {
313
+ "id": "gemma3",
314
+ "is_base": false,
315
+ "size": 1.146341463414634
316
+ },
317
+ {
318
+ "id": "gemma3n",
319
+ "is_base": false,
320
+ "size": 1.1951219512195121
321
+ },
322
+ {
323
+ "id": "glm",
324
+ "is_base": false,
325
+ "size": 1.2439024390243902
326
+ },
327
+ {
328
+ "id": "glm4",
329
+ "is_base": false,
330
+ "size": 1.146341463414634
331
+ },
332
+ {
333
+ "id": "glm4_moe",
334
+ "is_base": false,
335
+ "size": 1.146341463414634
336
+ },
337
+ {
338
+ "id": "glm4v",
339
+ "is_base": false,
340
+ "size": 1.0975609756097562
341
+ },
342
+ {
343
+ "id": "got_ocr2",
344
+ "is_base": false,
345
+ "size": 1.0975609756097562
346
+ },
347
+ {
348
+ "id": "gpt_neox",
349
+ "is_base": false,
350
+ "size": 1.0975609756097562
351
+ },
352
+ {
353
+ "id": "granite",
354
+ "is_base": false,
355
+ "size": 1.0975609756097562
356
+ },
357
+ {
358
+ "id": "granitemoe",
359
+ "is_base": true,
360
+ "size": 1.048780487804878
361
+ },
362
+ {
363
+ "id": "granitemoehybrid",
364
+ "is_base": false,
365
+ "size": 1.0975609756097562
366
+ },
367
+ {
368
+ "id": "granitemoeshared",
369
+ "is_base": false,
370
+ "size": 1.0975609756097562
371
+ },
372
+ {
373
+ "id": "grounding_dino",
374
+ "is_base": false,
375
+ "size": 1.048780487804878
376
+ },
377
+ {
378
+ "id": "helium",
379
+ "is_base": false,
380
+ "size": 1.146341463414634
381
+ },
382
+ {
383
+ "id": "hgnet_v2",
384
+ "is_base": false,
385
+ "size": 1.048780487804878
386
+ },
387
+ {
388
+ "id": "hubert",
389
+ "is_base": false,
390
+ "size": 1.048780487804878
391
+ },
392
+ {
393
+ "id": "idefics",
394
+ "is_base": true,
395
+ "size": 1.146341463414634
396
+ },
397
+ {
398
+ "id": "idefics3",
399
+ "is_base": true,
400
+ "size": 1.048780487804878
401
+ },
402
+ {
403
+ "id": "ijepa",
404
+ "is_base": false,
405
+ "size": 1.048780487804878
406
+ },
407
+ {
408
+ "id": "image_processing_base",
409
+ "is_base": true,
410
+ "size": 1.048780487804878
411
+ },
412
+ {
413
+ "id": "informer",
414
+ "is_base": false,
415
+ "size": 1.0975609756097562
416
+ },
417
+ {
418
+ "id": "instructblip",
419
+ "is_base": true,
420
+ "size": 1.048780487804878
421
+ },
422
+ {
423
+ "id": "instructblipvideo",
424
+ "is_base": false,
425
+ "size": 1.0975609756097562
426
+ },
427
+ {
428
+ "id": "internvl",
429
+ "is_base": false,
430
+ "size": 1.1951219512195121
431
+ },
432
+ {
433
+ "id": "jamba",
434
+ "is_base": true,
435
+ "size": 1.0975609756097562
436
+ },
437
+ {
438
+ "id": "janus",
439
+ "is_base": false,
440
+ "size": 1.3902439024390243
441
+ },
442
+ {
443
+ "id": "kyutai_speech_to_text",
444
+ "is_base": false,
445
+ "size": 1.146341463414634
446
+ },
447
+ {
448
+ "id": "lfm2",
449
+ "is_base": false,
450
+ "size": 1.0975609756097562
451
+ },
452
+ {
453
+ "id": "lightglue",
454
+ "is_base": false,
455
+ "size": 1.2439024390243902
456
+ },
457
+ {
458
+ "id": "llama",
459
+ "is_base": true,
460
+ "size": 3.0
461
+ },
462
+ {
463
+ "id": "llama4",
464
+ "is_base": true,
465
+ "size": 1.048780487804878
466
+ },
467
+ {
468
+ "id": "llava",
469
+ "is_base": true,
470
+ "size": 1.3414634146341464
471
+ },
472
+ {
473
+ "id": "llava_next",
474
+ "is_base": true,
475
+ "size": 1.146341463414634
476
+ },
477
+ {
478
+ "id": "llava_next_video",
479
+ "is_base": false,
480
+ "size": 1.0975609756097562
481
+ },
482
+ {
483
+ "id": "llava_onevision",
484
+ "is_base": false,
485
+ "size": 1.0975609756097562
486
+ },
487
+ {
488
+ "id": "mamba",
489
+ "is_base": true,
490
+ "size": 1.048780487804878
491
+ },
492
+ {
493
+ "id": "mamba2",
494
+ "is_base": true,
495
+ "size": 1.146341463414634
496
+ },
497
+ {
498
+ "id": "mask2former",
499
+ "is_base": false,
500
+ "size": 1.0975609756097562
501
+ },
502
+ {
503
+ "id": "maskformer",
504
+ "is_base": true,
505
+ "size": 1.048780487804878
506
+ },
507
+ {
508
+ "id": "mbart",
509
+ "is_base": true,
510
+ "size": 1.048780487804878
511
+ },
512
+ {
513
+ "id": "mimi",
514
+ "is_base": true,
515
+ "size": 1.048780487804878
516
+ },
517
+ {
518
+ "id": "minimax",
519
+ "is_base": false,
520
+ "size": 1.048780487804878
521
+ },
522
+ {
523
+ "id": "mistral",
524
+ "is_base": false,
525
+ "size": 1.3414634146341464
526
+ },
527
+ {
528
+ "id": "mistral3",
529
+ "is_base": false,
530
+ "size": 1.0975609756097562
531
+ },
532
+ {
533
+ "id": "mixtral",
534
+ "is_base": false,
535
+ "size": 1.2439024390243902
536
+ },
537
+ {
538
+ "id": "mlcd",
539
+ "is_base": false,
540
+ "size": 1.146341463414634
541
+ },
542
+ {
543
+ "id": "modeling_outputs",
544
+ "is_base": true,
545
+ "size": 1.048780487804878
546
+ },
547
+ {
548
+ "id": "modernbert",
549
+ "is_base": false,
550
+ "size": 1.0975609756097562
551
+ },
552
+ {
553
+ "id": "modernbert_decoder",
554
+ "is_base": false,
555
+ "size": 1.048780487804878
556
+ },
557
+ {
558
+ "id": "moonshine",
559
+ "is_base": false,
560
+ "size": 1.146341463414634
561
+ },
562
+ {
563
+ "id": "moshi",
564
+ "is_base": true,
565
+ "size": 1.048780487804878
566
+ },
567
+ {
568
+ "id": "nemotron",
569
+ "is_base": true,
570
+ "size": 1.048780487804878
571
+ },
572
+ {
573
+ "id": "olmo",
574
+ "is_base": false,
575
+ "size": 1.0975609756097562
576
+ },
577
+ {
578
+ "id": "olmo2",
579
+ "is_base": false,
580
+ "size": 1.146341463414634
581
+ },
582
+ {
583
+ "id": "opt",
584
+ "is_base": true,
585
+ "size": 1.048780487804878
586
+ },
587
+ {
588
+ "id": "owlv2",
589
+ "is_base": false,
590
+ "size": 1.048780487804878
591
+ },
592
+ {
593
+ "id": "owlvit",
594
+ "is_base": true,
595
+ "size": 1.048780487804878
596
+ },
597
+ {
598
+ "id": "paligemma",
599
+ "is_base": true,
600
+ "size": 1.146341463414634
601
+ },
602
+ {
603
+ "id": "perception_lm",
604
+ "is_base": false,
605
+ "size": 1.048780487804878
606
+ },
607
+ {
608
+ "id": "phi",
609
+ "is_base": false,
610
+ "size": 1.0975609756097562
611
+ },
612
+ {
613
+ "id": "phi3",
614
+ "is_base": false,
615
+ "size": 1.2439024390243902
616
+ },
617
+ {
618
+ "id": "phi4_multimodal",
619
+ "is_base": false,
620
+ "size": 1.146341463414634
621
+ },
622
+ {
623
+ "id": "plbart",
624
+ "is_base": false,
625
+ "size": 1.146341463414634
626
+ },
627
+ {
628
+ "id": "prompt_depth_anything",
629
+ "is_base": false,
630
+ "size": 1.048780487804878
631
+ },
632
+ {
633
+ "id": "qwen2",
634
+ "is_base": false,
635
+ "size": 1.1951219512195121
636
+ },
637
+ {
638
+ "id": "qwen2_5_omni",
639
+ "is_base": false,
640
+ "size": 1.1951219512195121
641
+ },
642
+ {
643
+ "id": "qwen2_5_vl",
644
+ "is_base": false,
645
+ "size": 1.146341463414634
646
+ },
647
+ {
648
+ "id": "qwen2_audio",
649
+ "is_base": true,
650
+ "size": 1.0975609756097562
651
+ },
652
+ {
653
+ "id": "qwen2_moe",
654
+ "is_base": true,
655
+ "size": 1.048780487804878
656
+ },
657
+ {
658
+ "id": "qwen2_vl",
659
+ "is_base": true,
660
+ "size": 1.146341463414634
661
+ },
662
+ {
663
+ "id": "qwen3",
664
+ "is_base": false,
665
+ "size": 1.2439024390243902
666
+ },
667
+ {
668
+ "id": "qwen3_moe",
669
+ "is_base": false,
670
+ "size": 1.2439024390243902
671
+ },
672
+ {
673
+ "id": "rt_detr",
674
+ "is_base": false,
675
+ "size": 1.1951219512195121
676
+ },
677
+ {
678
+ "id": "rt_detr_v2",
679
+ "is_base": false,
680
+ "size": 1.0975609756097562
681
+ },
682
+ {
683
+ "id": "sam",
684
+ "is_base": true,
685
+ "size": 1.146341463414634
686
+ },
687
+ {
688
+ "id": "sam_hq",
689
+ "is_base": false,
690
+ "size": 1.0975609756097562
691
+ },
692
+ {
693
+ "id": "sew",
694
+ "is_base": false,
695
+ "size": 1.048780487804878
696
+ },
697
+ {
698
+ "id": "siglip",
699
+ "is_base": true,
700
+ "size": 1.2926829268292683
701
+ },
702
+ {
703
+ "id": "siglip2",
704
+ "is_base": false,
705
+ "size": 1.048780487804878
706
+ },
707
+ {
708
+ "id": "smollm3",
709
+ "is_base": false,
710
+ "size": 1.0975609756097562
711
+ },
712
+ {
713
+ "id": "smolvlm",
714
+ "is_base": false,
715
+ "size": 1.048780487804878
716
+ },
717
+ {
718
+ "id": "starcoder2",
719
+ "is_base": false,
720
+ "size": 1.048780487804878
721
+ },
722
+ {
723
+ "id": "superglue",
724
+ "is_base": true,
725
+ "size": 1.048780487804878
726
+ },
727
+ {
728
+ "id": "t5gemma",
729
+ "is_base": false,
730
+ "size": 1.048780487804878
731
+ },
732
+ {
733
+ "id": "time_series_transformer",
734
+ "is_base": true,
735
+ "size": 1.048780487804878
736
+ },
737
+ {
738
+ "id": "timesfm",
739
+ "is_base": false,
740
+ "size": 1.0975609756097562
741
+ },
742
+ {
743
+ "id": "timm_wrapper",
744
+ "is_base": true,
745
+ "size": 1.048780487804878
746
+ },
747
+ {
748
+ "id": "unispeech",
749
+ "is_base": false,
750
+ "size": 1.048780487804878
751
+ },
752
+ {
753
+ "id": "unispeech_sat",
754
+ "is_base": false,
755
+ "size": 1.048780487804878
756
+ },
757
+ {
758
+ "id": "vipllava",
759
+ "is_base": false,
760
+ "size": 1.048780487804878
761
+ },
762
+ {
763
+ "id": "vit",
764
+ "is_base": true,
765
+ "size": 1.0975609756097562
766
+ },
767
+ {
768
+ "id": "voxtral",
769
+ "is_base": false,
770
+ "size": 1.048780487804878
771
+ },
772
+ {
773
+ "id": "wav2vec2",
774
+ "is_base": true,
775
+ "size": 1.3902439024390243
776
+ },
777
+ {
778
+ "id": "wav2vec2_bert",
779
+ "is_base": false,
780
+ "size": 1.0975609756097562
781
+ },
782
+ {
783
+ "id": "wav2vec2_conformer",
784
+ "is_base": false,
785
+ "size": 1.0975609756097562
786
+ },
787
+ {
788
+ "id": "wavlm",
789
+ "is_base": false,
790
+ "size": 1.048780487804878
791
+ },
792
+ {
793
+ "id": "whisper",
794
+ "is_base": true,
795
+ "size": 1.048780487804878
796
+ },
797
+ {
798
+ "id": "yolos",
799
+ "is_base": false,
800
+ "size": 1.048780487804878
801
+ },
802
+ {
803
+ "id": "zamba",
804
+ "is_base": true,
805
+ "size": 1.048780487804878
806
+ },
807
+ {
808
+ "id": "zamba2",
809
+ "is_base": false,
810
+ "size": 1.146341463414634
811
+ }
812
+ ],
813
+ "links": [
814
+ {
815
+ "source": "llama",
816
+ "target": "doge",
817
+ "label": "8 classes"
818
+ },
819
+ {
820
+ "source": "mixtral",
821
+ "target": "doge",
822
+ "label": "2 classes"
823
+ },
824
+ {
825
+ "source": "mixtral",
826
+ "target": "minimax",
827
+ "label": "11 classes"
828
+ },
829
+ {
830
+ "source": "clip",
831
+ "target": "phi",
832
+ "label": "1 classes"
833
+ },
834
+ {
835
+ "source": "llama",
836
+ "target": "phi",
837
+ "label": "8 classes"
838
+ },
839
+ {
840
+ "source": "qwen2_vl",
841
+ "target": "qwen2_5_vl",
842
+ "label": "15 classes"
843
+ },
844
+ {
845
+ "source": "ernie4_5",
846
+ "target": "ernie4_5_moe",
847
+ "label": "3 classes"
848
+ },
849
+ {
850
+ "source": "llama",
851
+ "target": "ernie4_5_moe",
852
+ "label": "2 classes"
853
+ },
854
+ {
855
+ "source": "mixtral",
856
+ "target": "ernie4_5_moe",
857
+ "label": "2 classes"
858
+ },
859
+ {
860
+ "source": "qwen3_moe",
861
+ "target": "ernie4_5_moe",
862
+ "label": "2 classes"
863
+ },
864
+ {
865
+ "source": "llama",
866
+ "target": "mistral",
867
+ "label": "10 classes"
868
+ },
869
+ {
870
+ "source": "llama",
871
+ "target": "gpt_neox",
872
+ "label": "4 classes"
873
+ },
874
+ {
875
+ "source": "mistral",
876
+ "target": "phi3",
877
+ "label": "7 classes"
878
+ },
879
+ {
880
+ "source": "wav2vec2",
881
+ "target": "unispeech",
882
+ "label": "9 classes"
883
+ },
884
+ {
885
+ "source": "llama",
886
+ "target": "olmo",
887
+ "label": "8 classes"
888
+ },
889
+ {
890
+ "source": "gemma",
891
+ "target": "helium",
892
+ "label": "3 classes"
893
+ },
894
+ {
895
+ "source": "granite",
896
+ "target": "helium",
897
+ "label": "1 classes"
898
+ },
899
+ {
900
+ "source": "llama",
901
+ "target": "helium",
902
+ "label": "5 classes"
903
+ },
904
+ {
905
+ "source": "gemma",
906
+ "target": "bitnet",
907
+ "label": "1 classes"
908
+ },
909
+ {
910
+ "source": "llama",
911
+ "target": "bitnet",
912
+ "label": "7 classes"
913
+ },
914
+ {
915
+ "source": "maskformer",
916
+ "target": "mask2former",
917
+ "label": "1 classes"
918
+ },
919
+ {
920
+ "source": "jamba",
921
+ "target": "falcon_h1",
922
+ "label": "1 classes"
923
+ },
924
+ {
925
+ "source": "llama",
926
+ "target": "falcon_h1",
927
+ "label": "7 classes"
928
+ },
929
+ {
930
+ "source": "mamba2",
931
+ "target": "falcon_h1",
932
+ "label": "4 classes"
933
+ },
934
+ {
935
+ "source": "llava",
936
+ "target": "got_ocr2",
937
+ "label": "6 classes"
938
+ },
939
+ {
940
+ "source": "sam",
941
+ "target": "got_ocr2",
942
+ "label": "5 classes"
943
+ },
944
+ {
945
+ "source": "esm",
946
+ "target": "evolla",
947
+ "label": "9 classes"
948
+ },
949
+ {
950
+ "source": "llama",
951
+ "target": "evolla",
952
+ "label": "6 classes"
953
+ },
954
+ {
955
+ "source": "gemma2",
956
+ "target": "gemma3n",
957
+ "label": "6 classes"
958
+ },
959
+ {
960
+ "source": "gemma3",
961
+ "target": "gemma3n",
962
+ "label": "6 classes"
963
+ },
964
+ {
965
+ "source": "paligemma",
966
+ "target": "gemma3n",
967
+ "label": "4 classes"
968
+ },
969
+ {
970
+ "source": "timm_wrapper",
971
+ "target": "gemma3n",
972
+ "label": "1 classes"
973
+ },
974
+ {
975
+ "source": "llama",
976
+ "target": "csm",
977
+ "label": "8 classes"
978
+ },
979
+ {
980
+ "source": "owlvit",
981
+ "target": "owlv2",
982
+ "label": "1 classes"
983
+ },
984
+ {
985
+ "source": "llama",
986
+ "target": "zamba2",
987
+ "label": "2 classes"
988
+ },
989
+ {
990
+ "source": "mamba2",
991
+ "target": "zamba2",
992
+ "label": "3 classes"
993
+ },
994
+ {
995
+ "source": "zamba",
996
+ "target": "zamba2",
997
+ "label": "10 classes"
998
+ },
999
+ {
1000
+ "source": "blip",
1001
+ "target": "janus",
1002
+ "label": "1 classes"
1003
+ },
1004
+ {
1005
+ "source": "blip_2",
1006
+ "target": "janus",
1007
+ "label": "1 classes"
1008
+ },
1009
+ {
1010
+ "source": "chameleon",
1011
+ "target": "janus",
1012
+ "label": "6 classes"
1013
+ },
1014
+ {
1015
+ "source": "idefics",
1016
+ "target": "janus",
1017
+ "label": "2 classes"
1018
+ },
1019
+ {
1020
+ "source": "llama",
1021
+ "target": "janus",
1022
+ "label": "1 classes"
1023
+ },
1024
+ {
1025
+ "source": "siglip",
1026
+ "target": "janus",
1027
+ "label": "4 classes"
1028
+ },
1029
+ {
1030
+ "source": "wav2vec2",
1031
+ "target": "wav2vec2_conformer",
1032
+ "label": "13 classes"
1033
+ },
1034
+ {
1035
+ "source": "clip",
1036
+ "target": "mlcd",
1037
+ "label": "7 classes"
1038
+ },
1039
+ {
1040
+ "source": "llama",
1041
+ "target": "mlcd",
1042
+ "label": "1 classes"
1043
+ },
1044
+ {
1045
+ "source": "qwen2_vl",
1046
+ "target": "mlcd",
1047
+ "label": "2 classes"
1048
+ },
1049
+ {
1050
+ "source": "gemma2",
1051
+ "target": "gemma3",
1052
+ "label": "10 classes"
1053
+ },
1054
+ {
1055
+ "source": "paligemma",
1056
+ "target": "gemma3",
1057
+ "label": "4 classes"
1058
+ },
1059
+ {
1060
+ "source": "instructblip",
1061
+ "target": "instructblipvideo",
1062
+ "label": "9 classes"
1063
+ },
1064
+ {
1065
+ "source": "auto",
1066
+ "target": "instructblipvideo",
1067
+ "label": "1 classes"
1068
+ },
1069
+ {
1070
+ "source": "glm4",
1071
+ "target": "glm4v",
1072
+ "label": "3 classes"
1073
+ },
1074
+ {
1075
+ "source": "qwen2_5_vl",
1076
+ "target": "glm4v",
1077
+ "label": "16 classes"
1078
+ },
1079
+ {
1080
+ "source": "llama",
1081
+ "target": "exaone4",
1082
+ "label": "10 classes"
1083
+ },
1084
+ {
1085
+ "source": "olmo2",
1086
+ "target": "exaone4",
1087
+ "label": "2 classes"
1088
+ },
1089
+ {
1090
+ "source": "cohere",
1091
+ "target": "glm4_moe",
1092
+ "label": "1 classes"
1093
+ },
1094
+ {
1095
+ "source": "deepseek_v3",
1096
+ "target": "glm4_moe",
1097
+ "label": "7 classes"
1098
+ },
1099
+ {
1100
+ "source": "gpt_neox",
1101
+ "target": "glm4_moe",
1102
+ "label": "1 classes"
1103
+ },
1104
+ {
1105
+ "source": "detr",
1106
+ "target": "conditional_detr",
1107
+ "label": "1 classes"
1108
+ },
1109
+ {
1110
+ "source": "detr",
1111
+ "target": "grounding_dino",
1112
+ "label": "1 classes"
1113
+ },
1114
+ {
1115
+ "source": "gemma",
1116
+ "target": "qwen3",
1117
+ "label": "1 classes"
1118
+ },
1119
+ {
1120
+ "source": "llama",
1121
+ "target": "qwen3",
1122
+ "label": "1 classes"
1123
+ },
1124
+ {
1125
+ "source": "qwen2",
1126
+ "target": "qwen3",
1127
+ "label": "10 classes"
1128
+ },
1129
+ {
1130
+ "source": "llava_next",
1131
+ "target": "llava_onevision",
1132
+ "label": "1 classes"
1133
+ },
1134
+ {
1135
+ "source": "llava_next_video",
1136
+ "target": "llava_onevision",
1137
+ "label": "9 classes"
1138
+ },
1139
+ {
1140
+ "source": "llava",
1141
+ "target": "vipllava",
1142
+ "label": "5 classes"
1143
+ },
1144
+ {
1145
+ "source": "detr",
1146
+ "target": "deformable_detr",
1147
+ "label": "1 classes"
1148
+ },
1149
+ {
1150
+ "source": "llava",
1151
+ "target": "perception_lm",
1152
+ "label": "5 classes"
1153
+ },
1154
+ {
1155
+ "source": "wav2vec2",
1156
+ "target": "wavlm",
1157
+ "label": "9 classes"
1158
+ },
1159
+ {
1160
+ "source": "llama",
1161
+ "target": "glm",
1162
+ "label": "4 classes"
1163
+ },
1164
+ {
1165
+ "source": "phi3",
1166
+ "target": "glm",
1167
+ "label": "1 classes"
1168
+ },
1169
+ {
1170
+ "source": "llama",
1171
+ "target": "timesfm",
1172
+ "label": "1 classes"
1173
+ },
1174
+ {
1175
+ "source": "phi4_multimodal",
1176
+ "target": "timesfm",
1177
+ "label": "1 classes"
1178
+ },
1179
+ {
1180
+ "source": "image_processing_base",
1181
+ "target": "dpt",
1182
+ "label": "1 classes"
1183
+ },
1184
+ {
1185
+ "source": "beit",
1186
+ "target": "dpt",
1187
+ "label": "1 classes"
1188
+ },
1189
+ {
1190
+ "source": "llama",
1191
+ "target": "gemma",
1192
+ "label": "5 classes"
1193
+ },
1194
+ {
1195
+ "source": "llama",
1196
+ "target": "kyutai_speech_to_text",
1197
+ "label": "1 classes"
1198
+ },
1199
+ {
1200
+ "source": "mimi",
1201
+ "target": "kyutai_speech_to_text",
1202
+ "label": "1 classes"
1203
+ },
1204
+ {
1205
+ "source": "moshi",
1206
+ "target": "kyutai_speech_to_text",
1207
+ "label": "2 classes"
1208
+ },
1209
+ {
1210
+ "source": "llama",
1211
+ "target": "granite",
1212
+ "label": "5 classes"
1213
+ },
1214
+ {
1215
+ "source": "idefics3",
1216
+ "target": "smolvlm",
1217
+ "label": "9 classes"
1218
+ },
1219
+ {
1220
+ "source": "granitemoe",
1221
+ "target": "granitemoeshared",
1222
+ "label": "4 classes"
1223
+ },
1224
+ {
1225
+ "source": "glm",
1226
+ "target": "moonshine",
1227
+ "label": "3 classes"
1228
+ },
1229
+ {
1230
+ "source": "llama",
1231
+ "target": "moonshine",
1232
+ "label": "3 classes"
1233
+ },
1234
+ {
1235
+ "source": "whisper",
1236
+ "target": "moonshine",
1237
+ "label": "2 classes"
1238
+ },
1239
+ {
1240
+ "source": "llava",
1241
+ "target": "aya_vision",
1242
+ "label": "6 classes"
1243
+ },
1244
+ {
1245
+ "source": "deepseek_v3",
1246
+ "target": "dots1",
1247
+ "label": "5 classes"
1248
+ },
1249
+ {
1250
+ "source": "qwen3",
1251
+ "target": "dots1",
1252
+ "label": "6 classes"
1253
+ },
1254
+ {
1255
+ "source": "mistral",
1256
+ "target": "starcoder2",
1257
+ "label": "9 classes"
1258
+ },
1259
+ {
1260
+ "source": "modeling_outputs",
1261
+ "target": "sam_hq",
1262
+ "label": "1 classes"
1263
+ },
1264
+ {
1265
+ "source": "sam",
1266
+ "target": "sam_hq",
1267
+ "label": "15 classes"
1268
+ },
1269
+ {
1270
+ "source": "wav2vec2",
1271
+ "target": "wav2vec2_bert",
1272
+ "label": "3 classes"
1273
+ },
1274
+ {
1275
+ "source": "wav2vec2_conformer",
1276
+ "target": "wav2vec2_bert",
1277
+ "label": "6 classes"
1278
+ },
1279
+ {
1280
+ "source": "mistral",
1281
+ "target": "mixtral",
1282
+ "label": "9 classes"
1283
+ },
1284
+ {
1285
+ "source": "chameleon",
1286
+ "target": "emu3",
1287
+ "label": "2 classes"
1288
+ },
1289
+ {
1290
+ "source": "llama",
1291
+ "target": "emu3",
1292
+ "label": "5 classes"
1293
+ },
1294
+ {
1295
+ "source": "siglip",
1296
+ "target": "emu3",
1297
+ "label": "1 classes"
1298
+ },
1299
+ {
1300
+ "source": "paligemma",
1301
+ "target": "colpali",
1302
+ "label": "3 classes"
1303
+ },
1304
+ {
1305
+ "source": "phi3",
1306
+ "target": "phi4_multimodal",
1307
+ "label": "7 classes"
1308
+ },
1309
+ {
1310
+ "source": "siglip",
1311
+ "target": "phi4_multimodal",
1312
+ "label": "9 classes"
1313
+ },
1314
+ {
1315
+ "source": "qwen2_audio",
1316
+ "target": "voxtral",
1317
+ "label": "4 classes"
1318
+ },
1319
+ {
1320
+ "source": "idefics",
1321
+ "target": "deepseek_vl",
1322
+ "label": "2 classes"
1323
+ },
1324
+ {
1325
+ "source": "janus",
1326
+ "target": "deepseek_vl",
1327
+ "label": "4 classes"
1328
+ },
1329
+ {
1330
+ "source": "glm",
1331
+ "target": "glm4",
1332
+ "label": "4 classes"
1333
+ },
1334
+ {
1335
+ "source": "phi3",
1336
+ "target": "glm4",
1337
+ "label": "1 classes"
1338
+ },
1339
+ {
1340
+ "source": "gemma2",
1341
+ "target": "t5gemma",
1342
+ "label": "9 classes"
1343
+ },
1344
+ {
1345
+ "source": "auto",
1346
+ "target": "lightglue",
1347
+ "label": "1 classes"
1348
+ },
1349
+ {
1350
+ "source": "clip",
1351
+ "target": "lightglue",
1352
+ "label": "1 classes"
1353
+ },
1354
+ {
1355
+ "source": "cohere",
1356
+ "target": "lightglue",
1357
+ "label": "1 classes"
1358
+ },
1359
+ {
1360
+ "source": "llama",
1361
+ "target": "lightglue",
1362
+ "label": "2 classes"
1363
+ },
1364
+ {
1365
+ "source": "superglue",
1366
+ "target": "lightglue",
1367
+ "label": "2 classes"
1368
+ },
1369
+ {
1370
+ "source": "llava_next",
1371
+ "target": "llava_next_video",
1372
+ "label": "7 classes"
1373
+ },
1374
+ {
1375
+ "source": "rt_detr",
1376
+ "target": "hgnet_v2",
1377
+ "label": "1 classes"
1378
+ },
1379
+ {
1380
+ "source": "deepseek_vl",
1381
+ "target": "deepseek_vl_hybrid",
1382
+ "label": "7 classes"
1383
+ },
1384
+ {
1385
+ "source": "idefics",
1386
+ "target": "deepseek_vl_hybrid",
1387
+ "label": "2 classes"
1388
+ },
1389
+ {
1390
+ "source": "sam",
1391
+ "target": "deepseek_vl_hybrid",
1392
+ "label": "2 classes"
1393
+ },
1394
+ {
1395
+ "source": "wav2vec2",
1396
+ "target": "data2vec",
1397
+ "label": "11 classes"
1398
+ },
1399
+ {
1400
+ "source": "depth_anything",
1401
+ "target": "prompt_depth_anything",
1402
+ "label": "7 classes"
1403
+ },
1404
+ {
1405
+ "source": "gemma",
1406
+ "target": "modernbert",
1407
+ "label": "2 classes"
1408
+ },
1409
+ {
1410
+ "source": "bamba",
1411
+ "target": "lfm2",
1412
+ "label": "1 classes"
1413
+ },
1414
+ {
1415
+ "source": "llama",
1416
+ "target": "lfm2",
1417
+ "label": "8 classes"
1418
+ },
1419
+ {
1420
+ "source": "wav2vec2",
1421
+ "target": "sew",
1422
+ "label": "11 classes"
1423
+ },
1424
+ {
1425
+ "source": "wav2vec2",
1426
+ "target": "hubert",
1427
+ "label": "7 classes"
1428
+ },
1429
+ {
1430
+ "source": "gemma",
1431
+ "target": "gemma2",
1432
+ "label": "9 classes"
1433
+ },
1434
+ {
1435
+ "source": "detr",
1436
+ "target": "rt_detr",
1437
+ "label": "2 classes"
1438
+ },
1439
+ {
1440
+ "source": "rt_detr",
1441
+ "target": "d_fine",
1442
+ "label": "12 classes"
1443
+ },
1444
+ {
1445
+ "source": "rt_detr_v2",
1446
+ "target": "d_fine",
1447
+ "label": "1 classes"
1448
+ },
1449
+ {
1450
+ "source": "llava",
1451
+ "target": "mistral3",
1452
+ "label": "6 classes"
1453
+ },
1454
+ {
1455
+ "source": "mistral",
1456
+ "target": "mistral3",
1457
+ "label": "1 classes"
1458
+ },
1459
+ {
1460
+ "source": "modernbert",
1461
+ "target": "modernbert_decoder",
1462
+ "label": "6 classes"
1463
+ },
1464
+ {
1465
+ "source": "llama",
1466
+ "target": "aria",
1467
+ "label": "8 classes"
1468
+ },
1469
+ {
1470
+ "source": "llava",
1471
+ "target": "aria",
1472
+ "label": "4 classes"
1473
+ },
1474
+ {
1475
+ "source": "llava_next",
1476
+ "target": "aria",
1477
+ "label": "1 classes"
1478
+ },
1479
+ {
1480
+ "source": "siglip",
1481
+ "target": "siglip2",
1482
+ "label": "16 classes"
1483
+ },
1484
+ {
1485
+ "source": "llama",
1486
+ "target": "arcee",
1487
+ "label": "5 classes"
1488
+ },
1489
+ {
1490
+ "source": "nemotron",
1491
+ "target": "arcee",
1492
+ "label": "1 classes"
1493
+ },
1494
+ {
1495
+ "source": "mamba",
1496
+ "target": "falcon_mamba",
1497
+ "label": "10 classes"
1498
+ },
1499
+ {
1500
+ "source": "llama",
1501
+ "target": "deepseek_v2",
1502
+ "label": "9 classes"
1503
+ },
1504
+ {
1505
+ "source": "llama4",
1506
+ "target": "deepseek_v2",
1507
+ "label": "1 classes"
1508
+ },
1509
+ {
1510
+ "source": "bart",
1511
+ "target": "informer",
1512
+ "label": "1 classes"
1513
+ },
1514
+ {
1515
+ "source": "time_series_transformer",
1516
+ "target": "informer",
1517
+ "label": "12 classes"
1518
+ },
1519
+ {
1520
+ "source": "colpali",
1521
+ "target": "colqwen2",
1522
+ "label": "3 classes"
1523
+ },
1524
+ {
1525
+ "source": "bamba",
1526
+ "target": "granitemoehybrid",
1527
+ "label": "4 classes"
1528
+ },
1529
+ {
1530
+ "source": "granitemoeshared",
1531
+ "target": "granitemoehybrid",
1532
+ "label": "7 classes"
1533
+ },
1534
+ {
1535
+ "source": "bart",
1536
+ "target": "plbart",
1537
+ "label": "5 classes"
1538
+ },
1539
+ {
1540
+ "source": "bigbird_pegasus",
1541
+ "target": "plbart",
1542
+ "label": "1 classes"
1543
+ },
1544
+ {
1545
+ "source": "mbart",
1546
+ "target": "plbart",
1547
+ "label": "1 classes"
1548
+ },
1549
+ {
1550
+ "source": "llama",
1551
+ "target": "qwen3_moe",
1552
+ "label": "4 classes"
1553
+ },
1554
+ {
1555
+ "source": "mixtral",
1556
+ "target": "qwen3_moe",
1557
+ "label": "3 classes"
1558
+ },
1559
+ {
1560
+ "source": "qwen2_moe",
1561
+ "target": "qwen3_moe",
1562
+ "label": "1 classes"
1563
+ },
1564
+ {
1565
+ "source": "qwen3",
1566
+ "target": "qwen3_moe",
1567
+ "label": "1 classes"
1568
+ },
1569
+ {
1570
+ "source": "clip",
1571
+ "target": "internvl",
1572
+ "label": "1 classes"
1573
+ },
1574
+ {
1575
+ "source": "janus",
1576
+ "target": "internvl",
1577
+ "label": "1 classes"
1578
+ },
1579
+ {
1580
+ "source": "llama",
1581
+ "target": "internvl",
1582
+ "label": "1 classes"
1583
+ },
1584
+ {
1585
+ "source": "llava",
1586
+ "target": "internvl",
1587
+ "label": "5 classes"
1588
+ },
1589
+ {
1590
+ "source": "glm",
1591
+ "target": "ernie4_5",
1592
+ "label": "1 classes"
1593
+ },
1594
+ {
1595
+ "source": "llama",
1596
+ "target": "ernie4_5",
1597
+ "label": "4 classes"
1598
+ },
1599
+ {
1600
+ "source": "dinov2",
1601
+ "target": "eomt",
1602
+ "label": "4 classes"
1603
+ },
1604
+ {
1605
+ "source": "mask2former",
1606
+ "target": "eomt",
1607
+ "label": "2 classes"
1608
+ },
1609
+ {
1610
+ "source": "siglip",
1611
+ "target": "eomt",
1612
+ "label": "1 classes"
1613
+ },
1614
+ {
1615
+ "source": "vit",
1616
+ "target": "eomt",
1617
+ "label": "1 classes"
1618
+ },
1619
+ {
1620
+ "source": "llama",
1621
+ "target": "dia",
1622
+ "label": "4 classes"
1623
+ },
1624
+ {
1625
+ "source": "phi3",
1626
+ "target": "dia",
1627
+ "label": "1 classes"
1628
+ },
1629
+ {
1630
+ "source": "llama",
1631
+ "target": "deepseek_v3",
1632
+ "label": "9 classes"
1633
+ },
1634
+ {
1635
+ "source": "jamba",
1636
+ "target": "bamba",
1637
+ "label": "2 classes"
1638
+ },
1639
+ {
1640
+ "source": "llama",
1641
+ "target": "bamba",
1642
+ "label": "6 classes"
1643
+ },
1644
+ {
1645
+ "source": "mamba2",
1646
+ "target": "bamba",
1647
+ "label": "4 classes"
1648
+ },
1649
+ {
1650
+ "source": "llama",
1651
+ "target": "olmo2",
1652
+ "label": "3 classes"
1653
+ },
1654
+ {
1655
+ "source": "olmo",
1656
+ "target": "olmo2",
1657
+ "label": "7 classes"
1658
+ },
1659
+ {
1660
+ "source": "clip",
1661
+ "target": "aimv2",
1662
+ "label": "3 classes"
1663
+ },
1664
+ {
1665
+ "source": "llama",
1666
+ "target": "aimv2",
1667
+ "label": "2 classes"
1668
+ },
1669
+ {
1670
+ "source": "siglip",
1671
+ "target": "aimv2",
1672
+ "label": "6 classes"
1673
+ },
1674
+ {
1675
+ "source": "gemma",
1676
+ "target": "diffllama",
1677
+ "label": "1 classes"
1678
+ },
1679
+ {
1680
+ "source": "llama",
1681
+ "target": "diffllama",
1682
+ "label": "8 classes"
1683
+ },
1684
+ {
1685
+ "source": "mistral",
1686
+ "target": "diffllama",
1687
+ "label": "1 classes"
1688
+ },
1689
+ {
1690
+ "source": "rt_detr",
1691
+ "target": "rt_detr_v2",
1692
+ "label": "6 classes"
1693
+ },
1694
+ {
1695
+ "source": "vit",
1696
+ "target": "ijepa",
1697
+ "label": "3 classes"
1698
+ },
1699
+ {
1700
+ "source": "llama",
1701
+ "target": "smollm3",
1702
+ "label": "9 classes"
1703
+ },
1704
+ {
1705
+ "source": "qwen2",
1706
+ "target": "smollm3",
1707
+ "label": "1 classes"
1708
+ },
1709
+ {
1710
+ "source": "cohere",
1711
+ "target": "cohere2",
1712
+ "label": "8 classes"
1713
+ },
1714
+ {
1715
+ "source": "gemma2",
1716
+ "target": "cohere2",
1717
+ "label": "1 classes"
1718
+ },
1719
+ {
1720
+ "source": "bart",
1721
+ "target": "biogpt",
1722
+ "label": "3 classes"
1723
+ },
1724
+ {
1725
+ "source": "opt",
1726
+ "target": "biogpt",
1727
+ "label": "1 classes"
1728
+ },
1729
+ {
1730
+ "source": "detr",
1731
+ "target": "yolos",
1732
+ "label": "1 classes"
1733
+ },
1734
+ {
1735
+ "source": "wav2vec2",
1736
+ "target": "unispeech_sat",
1737
+ "label": "11 classes"
1738
+ },
1739
+ {
1740
+ "source": "llama",
1741
+ "target": "qwen2",
1742
+ "label": "10 classes"
1743
+ },
1744
+ {
1745
+ "source": "mistral",
1746
+ "target": "qwen2",
1747
+ "label": "1 classes"
1748
+ },
1749
+ {
1750
+ "source": "llama",
1751
+ "target": "cohere",
1752
+ "label": "6 classes"
1753
+ },
1754
+ {
1755
+ "source": "llama",
1756
+ "target": "qwen2_5_omni",
1757
+ "label": "1 classes"
1758
+ },
1759
+ {
1760
+ "source": "qwen2_5_vl",
1761
+ "target": "qwen2_5_omni",
1762
+ "label": "8 classes"
1763
+ },
1764
+ {
1765
+ "source": "qwen2_audio",
1766
+ "target": "qwen2_5_omni",
1767
+ "label": "2 classes"
1768
+ },
1769
+ {
1770
+ "source": "qwen2_vl",
1771
+ "target": "qwen2_5_omni",
1772
+ "label": "1 classes"
1773
+ },
1774
+ {
1775
+ "source": "dinov2",
1776
+ "target": "dinov2_with_registers",
1777
+ "label": "6 classes"
1778
+ }
1779
+ ]
1780
+ };
1781
+ const hfLogoPath = "M21.2,6.7c-0.2-0.2-0.5-0.3-0.8-0.3H3.6C3.3,6.4,3,6.5,2.8,6.7s-0.3,0.5-0.3,0.8v10.8c0,0.3,0.1,0.5,0.3,0.8 c0.2,0.2,0.5,0.3,0.8,0.3h16.8c0.3,0,0.5-0.1,0.8-0.3c0.2-0.2,0.3-0.5,0.3-0.8V7.5C21.5,7.2,21.4,6.9,21.2,6.7z M12,17.8L5.9,9.4h3.1 V8.3h6v1.1h3.1L12,17.8z"; // kept for potential future use
1782
+
1783
+ const width = window.innerWidth;
1784
+ const height = window.innerHeight;
1785
+
1786
+ const svg = d3.select('#dependency-graph')
1787
+ .call(
1788
+ d3.zoom().on('zoom', (event) => {
1789
+ g.attr('transform', event.transform);
1790
+ })
1791
+ );
1792
+
1793
+ const g = svg.append('g');
1794
+
1795
+ // Forces – tweaked for tighter graph. NOTE(review): the custom property name read below appears to be spelled with a non-breaking hyphen (U+2011), not ASCII "-"; it must match the stylesheet declaration character-for-character, otherwise getPropertyValue() returns '' and parseFloat() yields NaN for the collide radius – confirm.
1796
+ const simulation = d3.forceSimulation(graphData.nodes)
1797
+ .force('link', d3.forceLink(graphData.links).id(d => d.id).distance(500))
1798
+ .force('charge', d3.forceManyBody().strength(-500))
1799
+ .force('center', d3.forceCenter(width / 2, height / 2))
1800
+ .force('collide', d3.forceCollide(0.01 * parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--base‑size'))));
1801
+
1802
+ // Links
1803
+ const link = g.append('g')
1804
+ .selectAll('line')
1805
+ .data(graphData.links)
1806
+ .join('line')
1807
+ .attr('class', 'link')
1808
+ .attr('stroke-width', 1.5);
1809
+
1810
+ // Link‑labels (#classes)
1811
+ const linkLabel = g.append('g')
1812
+ .selectAll('text')
1813
+ .data(graphData.links)
1814
+ .join('text')
1815
+ .attr('class', 'link-label')
1816
+ .text(d => d.label);
1817
+
1818
+ // Nodes (base vs derived)
1819
+ const node = g.append('g')
1820
+ .selectAll('g')
1821
+ .data(graphData.nodes)
1822
+ .join('g')
1823
+ .attr('class', d => d.is_base ? 'node base' : 'node derived')
1824
+ .call(d3.drag()
1825
+ .on('start', dragstarted)
1826
+ .on('drag', dragged)
1827
+ .on('end', dragended)
1828
+ );
1829
+
1830
+ // Base‑model icon (styled circle instead of external image)
1831
+ node.filter(d => d.is_base)
1832
+ .append('circle')
1833
+ .attr('r', parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--base‑size')) / 2)
1834
+ .attr('fill', '#FFD21E')
1835
+ .attr('stroke', '#FF9D00')
1836
+ .attr('stroke-width', 3);
1837
+
1838
+ // Add 🤗 emoji as text for base models
1839
+ node.filter(d => d.is_base)
1840
+ .append('text')
1841
+ .attr('text-anchor', 'middle')
1842
+ .attr('dy', '0.35em')
1843
+ .style('font-size', '24px')
1844
+ .text('🤗');
1845
+
1846
+ // Base‑model label (below icon)
1847
+ node.filter(d => d.is_base)
1848
+ .append('text')
1849
+ .attr('class', 'node-label')
1850
+ .attr('y', parseFloat(getComputedStyle(document.documentElement).getPropertyValue('--base‑size')) / 2 + 8)
1851
+ .style('font-size', '40px')
1852
+ .text(d => d.id);
1853
+
1854
+ // Derived‑model circle + label w/ background rect
1855
+ const derived = node.filter(d => !d.is_base);
1856
+
1857
+ derived.append('circle')
1858
+ .attr('r', d => 20 * d.size); // scaled
1859
+
1860
+ const labelGroup = derived.append('g').attr('class', 'label-group');
1861
+ labelGroup.append('rect')
1862
+ .attr('x', -45)
1863
+ .attr('y', -18)
1864
+ .attr('width', 90)
1865
+ .attr('height', 36)
1866
+ .attr('rx', 8)
1867
+ .attr('fill', '#fffbe6')
1868
+ .attr('stroke', '#ccc');
1869
+ labelGroup.append('text')
1870
+ .attr('class', 'node-label')
1871
+ .attr('dy', '0.35em')
1872
+ .style('font-size', '18px')
1873
+ .text(d => d.id);
1874
+
1875
+ // Tick
1876
+ simulation.on('tick', () => {
1877
+ link.attr('x1', d => d.source.x)
1878
+ .attr('y1', d => d.source.y)
1879
+ .attr('x2', d => d.target.x)
1880
+ .attr('y2', d => d.target.y);
1881
+
1882
+ linkLabel.attr('x', d => (d.source.x + d.target.x) / 2)
1883
+ .attr('y', d => (d.source.y + d.target.y) / 2);
1884
+
1885
+ node.attr('transform', d => `translate(${d.x}, ${d.y})`);
1886
+ });
1887
+
1888
+ // Drag helpers (d3.drag handlers): on start, reheat the simulation (alphaTarget 0.3) and pin the node at its current position via fx/fy; while dragging, move the pin to the pointer; on end, let the simulation cool (alphaTarget 0) and clear fx/fy so the node is free again
1889
+ function dragstarted(event, d) {
1890
+ if (!event.active) simulation.alphaTarget(0.3).restart();
1891
+ d.fx = d.x; d.fy = d.y;
1892
+ }
1893
+ function dragged(event, d) {
1894
+ d.fx = event.x; d.fy = event.y;
1895
+ }
1896
+ function dragended(event, d) {
1897
+ if (!event.active) simulation.alphaTarget(0);
1898
+ d.fx = null; d.fy = null;
1899
+ }
1900
+ </script>
1901
+ </body>
1902
+ </html>
dist/static/graph_modular_related_models.png ADDED

Git LFS Details

  • SHA256: bd824e584f1a036c4e7f1118de90697582fd6c31131c1a5d2ebc0a9ea30b27ce
  • Pointer size: 131 Bytes
  • Size of remote file: 124 kB
dist/static/hf-logo.svg ADDED
dist/static/model_debugger.png ADDED

Git LFS Details

  • SHA256: 7f62f8f9b3dfdd62463cda84144e2ea144cdc883cfccd0c4b737ead173ecc7c6
  • Pointer size: 131 Bytes
  • Size of remote file: 142 kB
dist/static/modular_candidates.png ADDED

Git LFS Details

  • SHA256: b209e24b01c98890a00361cf49d80365afc30b7f469271a52aee92ca2f905027
  • Pointer size: 131 Bytes
  • Size of remote file: 171 kB
dist/static/popular_models_barplot.png ADDED

Git LFS Details

  • SHA256: 2c75ec717c86a3c71f95f4686f5d27f5ed14ceb875f4438283095ce4cbfee299
  • Pointer size: 130 Bytes
  • Size of remote file: 43.5 kB
dist/style.css ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* style.css - Transformers Playthrough */
2
+
3
+ /* Import ultrascale-playbook base styles and add transformers-specific styling */
4
+ /* Define colors */
5
+ :root {
6
+ --distill-gray: rgb(107, 114, 128);
7
+ --distill-gray-light: rgb(185, 185, 185);
8
+ --distill-gray-lighter: rgb(228, 228, 228);
9
+ --distill-gray-lightest: rgb(245, 245, 245);
10
+ --distill-blue: #007BFF;
11
+ }
12
+
13
+ /* Container for the controls */
14
+ [id^="plot-"] {
15
+ display: flex;
16
+ flex-direction: column;
17
+ align-items: center;
18
+ gap: 15px; /* Adjust the gap between controls as needed */
19
+ }
20
+ [id^="plot-"] figure {
21
+ margin-bottom: 0px;
22
+ margin-top: 0px;
23
+ padding: 0px;
24
+ }
25
+ .plotly_caption {
26
+ font-style: italic;
27
+ margin-top: 10px;
28
+ }
29
+
30
+ .plotly_controls {
31
+ display: flex;
32
+ flex-wrap: wrap;
33
+ flex-direction: row;
34
+ justify-content: center;
35
+ align-items: flex-start;
36
+ gap: 30px;
37
+ }
38
+
39
+
40
+ .plotly_input_container {
41
+ display: flex;
42
+ align-items: center;
43
+ flex-direction: column;
44
+ gap: 10px;
45
+ }
46
+
47
+ /* Style for the select dropdown */
48
+ .plotly_input_container > select {
49
+ padding: 2px 4px;
50
+ /* border: 1px solid #ccc; */
51
+ line-height: 1.5em;
52
+ text-align: center;
53
+ border-radius: 4px;
54
+ font-size: 12px;
55
+ background-color: var(--distill-gray-lightest);
56
+ outline: none;
57
+ }
58
+
59
+ /* Style for the range input */
60
+
61
+ .plotly_slider {
62
+ display: flex;
63
+ align-items: center;
64
+ gap: 10px;
65
+ }
66
+
67
+ .plotly_slider > input[type="range"] {
68
+ -webkit-appearance: none;
69
+ height: 2px;
70
+ background: var(--distill-gray-light);
71
+ border-radius: 5px;
72
+ outline: none;
73
+ }
74
+
75
+ .plotly_slider > span {
76
+ font-size: 14px;
77
+ line-height: 1.6em;
78
+ min-width: 16px;
79
+ }
80
+
81
+ .plotly_slider > input[type="range"]::-webkit-slider-thumb {
82
+ -webkit-appearance: none;
83
+ appearance: none;
84
+ width: 18px;
85
+ height: 18px;
86
+ border-radius: 50%;
87
+ background: var(--distill-blue);
88
+ cursor: pointer;
89
+ }
90
+
91
+ .plotly_slider > input[type="range"]::-moz-range-thumb {
92
+ width: 18px;
93
+ height: 18px;
94
+ border-radius: 50%;
95
+ background: var(--distill-blue);
96
+ cursor: pointer;
97
+ }
98
+
99
+ /* Style for the labels */
100
+ .plotly_input_container > label {
101
+ font-size: 14px;
102
+ font-weight: bold;
103
+ }
104
+
105
+ .main-plot-container {
106
+ margin-top: 21px;
107
+ margin-bottom: 35px;
108
+ }
109
+
110
+ .main-plot-container > figure {
111
+ display: block !important;
112
+ /* Let this be handled by graph-container */
113
+ margin-bottom: 0px;
114
+ margin-top: 0px;
115
+ }
116
+ .main-plot-container > div {
117
+ display: none !important;
118
+ }
119
+
120
+
121
+ @media (min-width: 768px) {
122
+ .main-plot-container > figure {
123
+ display: none !important;
124
+ }
125
+ .main-plot-container > div {
126
+ display: flex !important;
127
+ }
128
+ }
129
+
130
+ d-byline .byline {
131
+ grid-template-columns: 1fr;
132
+ grid-column: text;
133
+ font-size: 0.9rem;
134
+ line-height: 1.8em;
135
+ }
136
+
137
+ @media (min-width: 768px) {
138
+ d-byline .byline {
139
+ grid-template-columns: 5fr 1fr 1fr;
140
+ }
141
+ }
142
+
143
+ #title-plot {
144
+ margin-top: 0px;
145
+ margin-bottom: 0px;
146
+ }
147
+
148
+ d-contents > nav a.active {
149
+ text-decoration: underline;
150
+ }
151
+
152
+ @media (max-width: 1199px) {
153
+ d-contents {
154
+ display: none;
155
+ background: white;
156
+ justify-self: start;
157
+ align-self: start;
158
+ padding-bottom: 0.5em;
159
+ margin-bottom: 1em;
160
+ padding-left: 0.25em;
161
+ border-bottom: 1px solid rgba(0, 0, 0, 0.1);
162
+ border-bottom-width: 1px;
163
+ border-bottom-style: solid;
164
+ border-bottom-color: rgba(0, 0, 0, 0.1);
165
+ overflow-y: scroll;
166
+ height: calc(100vh - 40px);
167
+ scrollbar-width: none;
168
+ z-index: -100;
169
+ }
170
+ }
171
+
172
+ d-contents a:hover {
173
+ border-bottom: none;
174
+ }
175
+
176
+ toc-title {
177
+ font-weight: bold;
178
+ font-size: 1.2em;
179
+ color: #333;
180
+ }
181
+
182
+ toggle-icon {
183
+ transition: transform 0.3s;
184
+ }
185
+
186
+ toggle-icon.collapsed {
187
+ transform: rotate(90deg);
188
+ }
189
+
190
+ .toc-content {
191
+ margin-top: 15px;
192
+ overflow: hidden;
193
+ /* max-height: 1000px; */
194
+ transition: max-height 0.3s ease-out;
195
+ }
196
+
197
+ .toc-content.collapsed {
198
+ max-height: 0;
199
+ margin-top: 0;
200
+ }
201
+
202
+ @media (min-width: 1200px) {
203
+ d-article {
204
+ /* Ensure d-article does not prevent sticky positioning */
205
+ overflow: visible;
206
+ }
207
+
208
+ d-contents {
209
+ align-self: start;
210
+ background: white;
211
+ grid-column-start: 1 !important;
212
+ grid-column-end: 4 !important;
213
+ grid-row: auto / span 6;
214
+ justify-self: end;
215
+ margin-top: 0em;
216
+ padding-right: 3em;
217
+ padding-left: 2em;
218
+ /* border-right: 1px solid rgba(0, 0, 0, 0.1);
219
+ border-right-width: 1px;
220
+ border-right-style: solid;
221
+ border-right-color: rgba(0, 0, 0, 0.1); */
222
+ position: -webkit-sticky; /* For Safari */
223
+ position: sticky;
224
+ top: 10px; /* Adjust this value if needed */
225
+ overflow-y: auto;
226
+ height: calc(100vh - 40px);
227
+ scrollbar-width: none;
228
+ transition: max-height 0.3s ease-out;
229
+ z-index: -100;
230
+ }
231
+ }
232
+
233
+ d-contents nav h3 {
234
+ margin-top: 0;
235
+ margin-bottom: 1em;
236
+ }
237
+
238
+ d-contents nav div div {
239
+ color: rgba(0, 0, 0, 0.8);
240
+ font-weight: bold;
241
+ }
242
+
243
+ d-contents nav a {
244
+ color: rgba(0, 0, 0, 0.8);
245
+ border-bottom: none;
246
+ text-decoration: none;
247
+ }
248
+
249
+ d-contents li {
250
+ list-style-type: none;
251
+ }
252
+
253
+ d-contents ul, d-article d-contents ul {
254
+ padding-left: 1em;
255
+ }
256
+
257
+ d-contents nav ul li {
258
+ margin-bottom: .25em;
259
+ }
260
+
261
+ d-contents nav a:hover {
262
+ text-decoration: underline solid rgba(0, 0, 0, 0.6);
263
+ }
264
+
265
+ d-contents nav ul {
266
+ margin-top: 0;
267
+ margin-bottom: 6px;
268
+ }
269
+
270
+
271
+ d-contents nav > div {
272
+ display: block;
273
+ outline: none;
274
+ margin-bottom: 0.5em;
275
+ }
276
+
277
+ d-contents nav > div > a {
278
+ font-size: 13px;
279
+ font-weight: 600;
280
+ }
281
+
282
+ d-article aside {
283
+ margin-bottom: 1em;
284
+ }
285
+
286
+ d-article img {
287
+ max-width: 100%;
288
+ }
289
+
290
+ @media (min-width: 768px) {
291
+ d-article aside {
292
+ margin-bottom: 0;
293
+ }
294
+ }
295
+
296
+ d-contents nav > div > a:hover,
297
+ d-contents nav > ul > li > a:hover {
298
+ text-decoration: none;
299
+ }
300
+
301
+ .note-box {
302
+ background-color: #f6f8fa;
303
+ border-left: 4px solid #444444;
304
+ padding: 1rem;
305
+ margin: 1rem 0; /* Keep this modest margin */
306
+ border-radius: 6px;
307
+ /* Add this to ensure the box only takes up needed space */
308
+ display: inline-block;
309
+ }
310
+
311
+ .note-box-title {
312
+ margin: 0;
313
+ color: #444444;
314
+ font-weight: 600;
315
+ font-size: 1em;
316
+ }
317
+
318
+ .note-box-content {
319
+ margin-top: 0.5rem;
320
+ margin-bottom: 0; /* Ensure no bottom margin */
321
+ color: #24292f;
322
+ font-size: 0.9em;
323
+ line-height: 1.5em;
324
+ }
325
+
326
+ /* For dark mode support */
327
+ @media (prefers-color-scheme: dark) {
328
+ .note-box {
329
+ background-color: #1c1c1c;
330
+ border-left-color: #888888;
331
+ }
332
+ .note-box-title {
333
+ color: #888888;
334
+ }
335
+ .note-box-content {
336
+ color: #d4d4d4;
337
+ }
338
+ }
339
+
340
+ d-article {
341
+ font-size: 1.0em;
342
+ }
343
+
344
+ .figure-legend {
345
+ font-size: 0.9em;
346
+ font-style: italic;
347
+ color: var(--distill-gray);
348
+ line-height: 1.5em;
349
+ }
350
+
351
+ d-code {
352
+ font-size: 12px;
353
+ }
354
+
355
+ .large-image-background {
356
+ width: 100vw;
357
+ padding-top: 10px;
358
+ padding-bottom: 10px;
359
+ margin-left: calc(-50vw + 50%);
360
+ margin-right: calc(-50vw + 50%);
361
+ background: white;
362
+ height: fit-content; /* This will make it match the image height */
363
+ display: flex;
364
+ justify-content: center; /* This will center your image */
365
+ }
366
+
367
+ .large-image-background-transparent {
368
+ /* width: 100vw; */
369
+ padding-top: 10px;
370
+ padding-bottom: 10px;
371
+ /* margin-left: calc(-50vw + 50%); */
372
+ margin-left:-100px;
373
+ margin-right: -100px;
374
+ /* margin-right: calc(-50vw + 50%); */
375
+ /* background: white; */
376
+ height: fit-content; /* This will make it match the image height */
377
+ display: flex;
378
+ justify-content: center; /* This will center your image */
379
+ }
380
+
381
+ .boxed-image {
382
+ padding: 0.5rem;
383
+ background: white;
384
+ border-radius: 12px;
385
+ border: 1px solid #e5e7eb;
386
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
387
+ }
388
+
389
+ d-article li {
390
+ margin-bottom: 0.0em;
391
+ }
392
+
393
+ d-article ul ul {
394
+ margin-bottom: 0.0em;
395
+ }
396
+
397
+ d-article ol ol {
398
+ margin-bottom: 0.0em;
399
+ }
400
+
401
+ d-article hr {
402
+ grid-column: text;
403
+ }
404
+
405
+ /* Memory visualization */
406
+ #graph-all {
407
+ min-width: 500px;
408
+ margin-right: 10px;
409
+ margin-bottom: 2rem;
410
+ padding: 0.5rem;
411
+ background: #f9fafb;
412
+ border-radius: 12px;
413
+ border: 1px solid #e5e7eb;
414
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
415
+ }
416
+
417
+
418
+ /* Main container styles */
419
+ #controls {
420
+ max-width: 1200px;
421
+ /* margin: 2rem auto; */
422
+ margin-bottom: 2rem;
423
+ margin-left: 10px;
424
+ padding: 0.6rem;
425
+ background: #f9fafb;
426
+ border-radius: 12px;
427
+ border: 1px solid #e5e7eb;
428
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
429
+ }
430
+
431
+ /* Grid layout */
432
+ #controls {
433
+ display: grid;
434
+ grid-template-columns: 1fr 1fr;
435
+ /* gap: 2rem; */
436
+ }
437
+
438
+ /* Cell styles */
439
+ .cell {
440
+ margin-bottom: 0.2rem;
441
+ }
442
+
443
+ /* Label styles */
444
+ label {
445
+ display: block;
446
+ /* margin-bottom: 0.5rem; */
447
+ font-size: 0.8rem;
448
+ font-weight: 500;
449
+ color: #374151;
450
+ }
451
+
452
+ /* Input container for range + number combination */
453
+ .input-container {
454
+ display: flex;
455
+ gap: 1rem;
456
+ align-items: center;
457
+ }
458
+
459
+ /* Range input styling */
460
+ input[type="range"] {
461
+ flex: 1;
462
+ height: 6px;
463
+ background: #e5e7eb;
464
+ border-radius: 3px;
465
+ appearance: none;
466
+ outline: none;
467
+ }
468
+
469
+ input[type="range"]::-webkit-slider-thumb {
470
+ appearance: none;
471
+ width: 16px;
472
+ height: 16px;
473
+ background: #3b82f6;
474
+ border-radius: 50%;
475
+ cursor: pointer;
476
+ transition: background 0.15s ease;
477
+ }
478
+
479
+ input[type="range"]::-webkit-slider-thumb:hover {
480
+ background: #2563eb;
481
+ }
482
+
483
+ /* Number input styling */
484
+ input[type="number"] {
485
+ width: 80px;
486
+ padding: 0.5rem;
487
+ border: 1px solid #e5e7eb;
488
+ border-radius: 6px;
489
+ font-size: 0.9rem;
490
+ color: #374151;
491
+ }
492
+
493
+ /* Select styling */
494
+ select {
495
+ width: 100%;
496
+ padding: 0.5rem;
497
+ border: 1px solid #e5e7eb;
498
+ border-radius: 6px;
499
+ background: white;
500
+ font-size: 0.9rem;
501
+ color: #374151;
502
+ cursor: pointer;
503
+ }
504
+
505
+ /* Checkbox styling */
506
+ input[type="checkbox"] {
507
+ width: 1.2rem;
508
+ height: 1.2rem;
509
+ margin-right: 0.5rem;
510
+ border: 2px solid #e5e7eb;
511
+ border-radius: 4px;
512
+ cursor: pointer;
513
+ }
514
+
515
+ /* Column specific styles */
516
+ .column-1 {
517
+ padding-right: 0.5rem;
518
+ }
519
+
520
+ .column-2 {
521
+ padding-left: 0.5rem;
522
+ }
523
+
524
+ /* Checkbox container */
525
+ .checkbox-container {
526
+ display: flex;
527
+ align-items: center;
528
+ margin-bottom: 1rem;
529
+ }
530
+
531
+ /* Memory visualization styles */
532
+ .memory-block {
533
+ background: #fff;
534
+ border-radius: 8px;
535
+ padding: 1rem;
536
+ margin-bottom: 1rem;
537
+ box-shadow: 0 2px 4px rgba(0, 0, 0, 0.05);
538
+ }
539
+
540
+ .memory-title {
541
+ font-size: 1.1rem;
542
+ font-weight: 500;
543
+ color: #374151;
544
+ margin-bottom: 0.5rem;
545
+ }
546
+
547
+ .memory-value {
548
+ font-size: 1.5rem;
549
+ font-weight: 600;
550
+ color: #3b82f6;
551
+ }
552
+
553
+ /* Responsive adjustments */
554
+ @media (max-width: 768px) {
555
+ #controls {
556
+ grid-template-columns: 1fr;
557
+ padding: 1rem;
558
+ }
559
+
560
+ .column-1, .column-2 {
561
+ padding: 0;
562
+ }
563
+ }
564
+
565
+ /* Hover states and transitions */
566
+ input:hover, select:hover {
567
+ border-color: #3b82f6;
568
+ }
569
+
570
+ input:focus, select:focus {
571
+ border-color: #2563eb;
572
+ outline: none;
573
+ box-shadow: 0 0 0 2px rgba(59, 130, 246, 0.1);
574
+ }
575
+
576
+ /* Add smooth transitions */
577
+ input, select, button {
578
+ transition: all 0.15s ease;
579
+ }
580
+
581
+ /* Preset dropdown special styling */
582
+ select[name="presets"] {
583
+ background-color: #f3f4f6;
584
+ font-weight: 500;
585
+ }
586
+
587
+ /* Memory graph enhancements */
588
+ .activation-memory {
589
+ background: #dbeafe;
590
+ padding: 1rem;
591
+ border-radius: 8px;
592
+ margin-bottom: 1rem;
593
+ }
594
+
595
+ .gradient-memory {
596
+ background: #ede9fe;
597
+ padding: 1rem;
598
+ border-radius: 8px;
599
+ }
600
+
601
+ .order-button-second {
602
+ background: linear-gradient(135deg, #6DB4C4, #D4A5B8);
603
+ color: white;
604
+ font-size: 18px;
605
+ font-weight: 600;
606
+ padding: 20px 20px;
607
+ border: none;
608
+ border-radius: 12px;
609
+ cursor: pointer;
610
+ text-transform: uppercase;
611
+ letter-spacing: 1px;
612
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
613
+ transition: all 0.3s ease;
614
+ position: relative;
615
+ overflow: hidden;
616
+ }
617
+ .order-button-second:hover {
618
+ transform: translateY(-2px);
619
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.25);
620
+ }
621
+
622
+ .order-button-second:active {
623
+ transform: translateY(0);
624
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
625
+ }
626
+
627
+ .order-button-second::before {
628
+ content: '';
629
+ position: absolute;
630
+ top: 0;
631
+ left: -100%;
632
+ width: 100%;
633
+ height: 100%;
634
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 0));
635
+ transition: left 0.5s ease;
636
+ }
637
+
638
+ .order-button-second:hover::before {
639
+ left: 100%;
640
+ }
641
+
642
+ .order-button {
643
+ background: linear-gradient(135deg, #6DB4C4, #D4A5B8);
644
+ color: white;
645
+ font-size: 18px;
646
+ font-weight: 600;
647
+ padding: 16px 32px;
648
+ border: none;
649
+ border-radius: 12px;
650
+ cursor: pointer;
651
+ text-transform: uppercase;
652
+ letter-spacing: 1px;
653
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
654
+ transition: all 0.3s ease;
655
+ position: relative;
656
+ overflow: hidden;
657
+ }
658
+
659
+ .order-button:hover {
660
+ transform: translateY(-2px);
661
+ box-shadow: 0 6px 20px rgba(0, 0, 0, 0.25);
662
+ }
663
+
664
+ .order-button:active {
665
+ transform: translateY(0);
666
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.2);
667
+ }
668
+
669
+ .order-button::before {
670
+ content: '';
671
+ position: absolute;
672
+ top: 0;
673
+ left: -100%;
674
+ width: 100%;
675
+ height: 100%;
676
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 0));
677
+ transition: left 0.5s ease;
678
+ }
679
+
680
+ .order-button:hover::before {
681
+ left: 100%;
682
+ }
683
+ .order-button-container-second {
684
+ /* display: flex; */
685
+ justify-content: center;
686
+ margin: 0px 0;
687
+ }
688
+
689
+ .order-button-container {
690
+ display: flex;
691
+ justify-content: center;
692
+ margin: 0px 0 40px 0;
693
+ }
694
+
695
+ d-article img {
696
+ width: 100%!important;
697
+ }
698
+
699
+
700
+ iframe, .js-plotly-plot {
701
+ width: 100%!important;
702
+ margin-bottom: 20px;
703
+ }
704
+
705
+ .modebar-container {
706
+ display: none;
707
+ }
708
+
709
+ #graph-container {
710
+ display: grid; grid-template-columns: 1fr 1fr; align-items: center;
711
+ }
712
+
713
+ @media (max-width: 768px) {
714
+ #graph-container {
715
+ grid-template-columns: 1fr;
716
+ }
717
+ }
718
+
719
+ @media (max-width: 1024px) {
720
+ #graph-container {
721
+ grid-template-columns: 1fr;
722
+ }
723
+ #graph-all {
724
+ margin-right: 0px;
725
+ }
726
+ #controls {
727
+ margin-left: 0px;
728
+ }
729
+ }
730
+
731
+ .main-plot-container svg {
732
+ background: transparent !important;
733
+ }
734
+
735
+ .large-image-background-transparent {
736
+ margin-left: 0px;
737
+ margin-right: 0px;
738
+ }
739
+
740
+ /* Import transformers-specific styles. NOTE(review): browsers ignore an @import that appears after other style rules, so this line only takes effect if a build step inlines it; otherwise load transformers-custom.css through its own <link> (moving the @import to the top of this file would also change the cascade order) - confirm. */
741
+ @import url('./transformers-custom.css');