Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -50,12 +50,12 @@ def generate_plot(E, A, k, alpha):
|
|
| 50 |
return plt
|
| 51 |
|
| 52 |
|
| 53 |
-
OUTPUT_TEMPLATE = """Loss for a {n}B model when P={p} is: **{loss}**. It is equivalent to:
|
| 54 |
|
| 55 |
-
- A {n1}B model with P=1
|
| 56 |
-
- A {n2}B model with P=2
|
| 57 |
-
- A {n4}B model with P=4
|
| 58 |
-
- A {n8}B model with P=8
|
| 59 |
|
| 60 |
Note: The equivalent parameters are for reference only. In some reasoning tasks, scaling the parallel streams will obtain more performance gains than the loss benefits!
|
| 61 |
|
|
@@ -77,13 +77,12 @@ def process_inputs(E, A, k, alpha, n, p):
|
|
| 77 |
|
| 78 |
# Create interface
|
| 79 |
|
| 80 |
-
HEAD = """
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
$$
|
| 87 |
"""
|
| 88 |
|
| 89 |
with gr.Blocks() as demo:
|
|
@@ -91,6 +90,12 @@ with gr.Blocks() as demo:
|
|
| 91 |
|
| 92 |
with gr.Row():
|
| 93 |
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
# Input values
|
| 96 |
N = gr.Number(value=2.8, label="N: Number of Non-Embedding Model Parameters (in Billion)")
|
|
@@ -111,11 +116,12 @@ with gr.Blocks() as demo:
|
|
| 111 |
param_k = gr.Number(value=PARAM_SETS["Stack-V2-Python"]['k'], label="k")
|
| 112 |
param_alpha = gr.Number(value=PARAM_SETS["Stack-V2-Python"]['alpha'], label="alpha")
|
| 113 |
|
| 114 |
-
submit_btn = gr.Button("Estimate Loss and Equivalent Model Parameters")
|
| 115 |
|
| 116 |
|
| 117 |
plot, output = process_inputs(PARAM_SETS["Stack-V2-Python"]['E'], PARAM_SETS["Stack-V2-Python"]['A'], PARAM_SETS["Stack-V2-Python"]['k'], PARAM_SETS["Stack-V2-Python"]['alpha'], 2.8, 4)
|
| 118 |
with gr.Column():
|
|
|
|
|
|
|
| 119 |
# Output section
|
| 120 |
plot_output = gr.Plot(label="Scaling Law Curve", value=plot)
|
| 121 |
result_output = gr.Markdown(label="Result", value=output)
|
|
|
|
| 50 |
return plt
|
| 51 |
|
| 52 |
|
| 53 |
+
OUTPUT_TEMPLATE = """Loss for a {n}B model when P={p} is: **{loss:.5f}**. It is equivalent to:
|
| 54 |
|
| 55 |
+
- A **{n1}B** model with **P=1**;
|
| 56 |
+
- A **{n2}B** model with **P=2**;
|
| 57 |
+
- A **{n4}B** model with **P=4**;
|
| 58 |
+
- A **{n8}B** model with **P=8**;
|
| 59 |
|
| 60 |
Note: The equivalent parameters are for reference only. In some reasoning tasks, scaling the parallel streams will obtain more performance gains than the loss benefits!
|
| 61 |
|
|
|
|
| 77 |
|
| 78 |
# Create interface
|
| 79 |
|
| 80 |
+
HEAD = """<div align="center">
|
| 81 |
|
| 82 |
+
# Parallel Scaling Law Visualization
|
| 83 |
+
|
| 84 |
+
[arXiv:2505.10475](https://arxiv.org/abs/2505.10475)
|
| 85 |
+
</div>
|
|
|
|
| 86 |
"""
|
| 87 |
|
| 88 |
with gr.Blocks() as demo:
|
|
|
|
| 90 |
|
| 91 |
with gr.Row():
|
| 92 |
with gr.Column():
|
| 93 |
+
|
| 94 |
+
gr.Markdown("""$$
|
| 95 |
+
\\text{Loss}=E+\\left(
|
| 96 |
+
\\frac{A}{\\text{Parameters}\\times (1+k\\log P)}
|
| 97 |
+
\\right)^{\\alpha}
|
| 98 |
+
$$""")
|
| 99 |
|
| 100 |
# Input values
|
| 101 |
N = gr.Number(value=2.8, label="N: Number of Non-Embedding Model Parameters (in Billion)")
|
|
|
|
| 116 |
param_k = gr.Number(value=PARAM_SETS["Stack-V2-Python"]['k'], label="k")
|
| 117 |
param_alpha = gr.Number(value=PARAM_SETS["Stack-V2-Python"]['alpha'], label="alpha")
|
| 118 |
|
|
|
|
| 119 |
|
| 120 |
|
| 121 |
plot, output = process_inputs(PARAM_SETS["Stack-V2-Python"]['E'], PARAM_SETS["Stack-V2-Python"]['A'], PARAM_SETS["Stack-V2-Python"]['k'], PARAM_SETS["Stack-V2-Python"]['alpha'], 2.8, 4)
|
| 122 |
with gr.Column():
|
| 123 |
+
|
| 124 |
+
submit_btn = gr.Button("Calculate")
|
| 125 |
# Output section
|
| 126 |
plot_output = gr.Plot(label="Scaling Law Curve", value=plot)
|
| 127 |
result_output = gr.Markdown(label="Result", value=output)
|