Spaces:
Running
Running
Update curated.py
Browse files- curated.py +49 -0
curated.py
CHANGED
|
@@ -808,6 +808,54 @@ fig.update_layout(
|
|
| 808 |
# Show the plot
|
| 809 |
stacked_bar = fig
|
| 810 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 811 |
|
| 812 |
def curated(request):
|
| 813 |
|
|
@@ -943,6 +991,7 @@ def curated(request):
|
|
| 943 |
plotly2fasthtml(get_chart_28168342()),
|
| 944 |
plotly2fasthtml(get_chart_new()),
|
| 945 |
plotly2fasthtml(stacked_bar),
|
|
|
|
| 946 |
H2("Curated Sources Processing"),
|
| 947 |
filtering_process,
|
| 948 |
data_preparation_div,
|
|
|
|
| 808 |
# Show the plot
|
| 809 |
stacked_bar = fig
|
| 810 |
|
| 811 |
+
# Per-dataset line counts for the curated sources. Every dataset list is
# aligned position-by-position with the labels under 'Filter': three filter
# steps followed by the 'Total Lines Remaining' entry.
filter_data = {
    'Filter': [
        'Language Filter',
        'Min Word Count',
        'Unigram log probability',
        'Total Lines Remaining',
    ],
    'Wikipedia': [0, 1146416, 60468491, 60468491],
    'Freelaw': [2280522, 5518932, 68171834, 68123174],
    'DM Maths': [0, 0, 112559888, 112559888],
    'USPTO': [1312, 129042, 6749922, 6749389],
    'PG19': [69, 1, 28682, 28632],
    'Hackernews': [54129, 314, 2010488, 2003636],
    'Ubuntu IRC': [14465, 33, 23468, 23205],
    'Europarl': [0, 0, 69814, 69814],
    'StackExchange': [0, 196, 23246352, 23246352],
    'Arxiv': [42426, 105601, 1763840, 1762661],
    'S2ORC': [0, 0, 12963563, 12963563],
    'S2ORC Abstract': [18456575, 978308, 82889293, 82777912],
    'Pubmed Central': [400446, 62176, 4768310, 4767474],
    'Pubmed Abstract': [3100, 36419, 25747955, 25746724],
    'Phil Papers': [10214, 0, 39175, 39128],
}

# Tabular form: one row per filter label, one column per dataset.
filter_df = pd.DataFrame(filter_data)

# One bar trace per dataset column (every column after the leading 'Filter'
# label column); all traces share the filter labels on the x axis.
fig = go.Figure(
    data=[
        go.Bar(
            name=column,
            x=filter_df['Filter'],
            y=filter_df[column],
        )
        for column in filter_df.columns[1:]
    ]
)

# Stack the per-dataset bars on top of each other and label the chart.
fig.update_layout(
    barmode='stack',
    title='Stacked Bar Chart of Filters for Each Dataset',
    xaxis_title='Filter',
    yaxis_title='Number of Lines',
    legend_title='Dataset',
    height=600,
    width=1000,
)

# Keep a dedicated module-level reference to this figure; nothing is
# displayed here, the figure is only bound to a name for later use.
diff_stacked_bar = fig
|
| 858 |
+
|
| 859 |
|
| 860 |
def curated(request):
|
| 861 |
|
|
|
|
| 991 |
plotly2fasthtml(get_chart_28168342()),
|
| 992 |
plotly2fasthtml(get_chart_new()),
|
| 993 |
plotly2fasthtml(stacked_bar),
|
| 994 |
+
plotly2fasthtml(diff_stacked_bar),
|
| 995 |
H2("Curated Sources Processing"),
|
| 996 |
filtering_process,
|
| 997 |
data_preparation_div,
|