Spaces:

LLM360
/

TxT360

Running

App Files Files Community

victormiller committed on Oct 1, 2024

Commit

e93fc1a

verified ·

1 Parent(s): 858c4bf

Update curated.py

Browse files

Files changed (1) hide show

curated.py +49 -0

curated.py CHANGED Viewed

@@ -808,6 +808,54 @@ fig.update_layout(
 # Show the plot
 stacked_bar = fig
 def curated(request):
@@ -943,6 +991,7 @@ def curated(request):
             plotly2fasthtml(get_chart_28168342()),
             plotly2fasthtml(get_chart_new()),
             plotly2fasthtml(stacked_bar),
             H2("Curated Sources Processing"),
             filtering_process,
             data_preparation_div,

 # Show the plot
 stacked_bar = fig
+# Line counts per dataset; each list is aligned index-by-index with the 'Filter' labels
+filter_data = {
+    'Filter': ['Language Filter', 'Min Word Count', 'Unigram log probability', 'Total Lines Remaining'],
+    'Wikipedia': [0, 1146416, 60468491, 60468491],
+    'Freelaw': [2280522, 5518932, 68171834, 68123174],
+    'DM Maths': [0, 0, 112559888, 112559888],
+    'USPTO': [1312, 129042, 6749922, 6749389],
+    'PG19': [69, 1, 28682, 28632],
+    'Hackernews': [54129, 314, 2010488, 2003636],
+    'Ubuntu IRC': [14465, 33, 23468, 23205],
+    'Europarl': [0, 0, 69814, 69814],
+    'StackExchange': [0, 196, 23246352, 23246352],
+    'Arxiv': [42426, 105601, 1763840, 1762661],
+    'S2ORC': [0, 0, 12963563, 12963563],
+    'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
+    'Pubmed Central': [400446, 62176, 4768310, 4767474],
+    'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
+    'Phil Papers': [10214, 0, 39175, 39128]
+}
+# Creating a new dataframe for the filter data
+filter_df = pd.DataFrame(filter_data)
+# Creating the stacked bar chart
+fig = go.Figure()
+# Add trace for each dataset
+for dataset in filter_df.columns[1:]:
+    fig.add_trace(go.Bar(
+        name=dataset,
+        x=filter_df['Filter'],
+        y=filter_df[dataset]
+    ))
+# Update the layout
+fig.update_layout(
+    barmode='stack',
+    title='Stacked Bar Chart of Filters for Each Dataset',
+    xaxis_title='Filter',
+    yaxis_title='Number of Lines',
+    legend_title='Dataset',
+    height=600,
+    width=1000
+)
+# Keep the figure under its own name so curated() can render it
+diff_stacked_bar = fig
 def curated(request):
             plotly2fasthtml(get_chart_28168342()),
             plotly2fasthtml(get_chart_new()),
             plotly2fasthtml(stacked_bar),
+            plotly2fasthtml(diff_stacked_bar),
             H2("Curated Sources Processing"),
             filtering_process,
             data_preparation_div,