Spaces:
				
			
			
	
			
			
		Runtime error
		
	
	
	
			
			
	
	
	
	
		
		
		Runtime error
		
	Add bar chart showing shortest/longest median tokenized text
Browse files
    	
        app.py
    CHANGED
    
    | @@ -9,6 +9,8 @@ import seaborn as sns | |
| 9 | 
             
            import numpy as np
         | 
| 10 | 
             
            import plotly.figure_factory as ff
         | 
| 11 | 
             
            import plotly.express as px
         | 
|  | |
|  | |
| 12 | 
             
            import random, glob
         | 
| 13 |  | 
| 14 | 
             
            @st.cache_data
         | 
| @@ -51,7 +53,9 @@ tokenizer_names_to_test = [ | |
| 51 | 
             
            with st.sidebar:
         | 
| 52 |  | 
| 53 | 
             
            	st.header('All languages are NOT created (tokenized) equal!')
         | 
| 54 | 
            -
            	link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). | 
|  | |
|  | |
| 55 | 
             
            	st.markdown(link)
         | 
| 56 |  | 
| 57 | 
             
            	st.header('Data Visualization')
         | 
| @@ -130,7 +134,33 @@ with st.container(): | |
| 130 | 
             
            	) 
         | 
| 131 | 
             
            	st.plotly_chart(fig, use_container_width=True)
         | 
| 132 |  | 
| 133 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 134 |  | 
| 135 |  | 
| 136 |  | 
|  | |
| 9 | 
             
            import numpy as np
         | 
| 10 | 
             
            import plotly.figure_factory as ff
         | 
| 11 | 
             
            import plotly.express as px
         | 
| 12 | 
            +
            from plotly.subplots import make_subplots
         | 
| 13 | 
            +
            import plotly.graph_objects as go
         | 
| 14 | 
             
            import random, glob
         | 
| 15 |  | 
| 16 | 
             
            @st.cache_data
         | 
|  | |
| 53 | 
             
            with st.sidebar:
         | 
| 54 |  | 
| 55 | 
             
            	st.header('All languages are NOT created (tokenized) equal!')
         | 
| 56 | 
            +
            	link="This project compares the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese)."
         | 
| 57 | 
            +
            	st.markdown(link)
         | 
| 58 | 
            +
            	link="This is part of a larger project of measuring inequality in NLP. See the original article: [All languages are NOT created (tokenized) equal](https://www.artfish.ai/p/all-languages-are-not-created-tokenized) on [Art Fish Intelligence](https://www.artfish.ai/)."
         | 
| 59 | 
             
            	st.markdown(link)
         | 
| 60 |  | 
| 61 | 
             
            	st.header('Data Visualization')
         | 
|  | |
| 134 | 
             
            	) 
         | 
| 135 | 
             
            	st.plotly_chart(fig, use_container_width=True)
         | 
| 136 |  | 
| 137 | 
            +
             | 
| 138 | 
            +
            	# Horizontal bar chart (px.bar) of the 7 languages with the shortest and the 7 with the longest median token counts
         | 
| 139 | 
            +
            	shortest = val_data.groupby('lang')[tokenizer_name].median().sort_values().head(7).reset_index()
         | 
| 140 | 
            +
            	shortest["type"] = "shortest"
         | 
| 141 | 
            +
            	longest = val_data.groupby('lang')[tokenizer_name].median().sort_values().tail(7).reset_index()
         | 
| 142 | 
            +
            	longest["type"] = "longest"
         | 
| 143 | 
            +
            	combined = pd.concat([shortest, longest]).reset_index(drop=True).sort_values(by=tokenizer_name, ascending=False)
         | 
| 144 | 
            +
            	color_sequence = px.colors.qualitative.D3  # You can choose other built-in sequences or define your own
         | 
| 145 | 
            +
            	fig = px.bar(combined, x=tokenizer_name, y="lang", orientation='h', color='type', color_discrete_sequence=color_sequence)
         | 
| 146 | 
            +
            	fig.update_traces(hovertemplate='%{y}: %{x} tokens')
         | 
| 147 | 
            +
            	fig.update_layout(
         | 
| 148 | 
            +
            			title=dict(text='Top Langs with Shortest and Longest Median Token Lengths',
         | 
| 149 | 
            +
            			 font=dict(size=25), automargin=True, yref='paper', pad=dict(b=20)),  # Add more padding below the title
         | 
| 150 | 
            +
            			# title='Distribution of tokens',
         | 
| 151 | 
            +
            	    xaxis=dict(
         | 
| 152 | 
            +
                    title="Number of Tokens",
         | 
| 153 | 
            +
                    showgrid=True,   # Show vertical gridlines
         | 
| 154 | 
            +
                    gridwidth=1,     # Gridline width
         | 
| 155 | 
            +
                    gridcolor='LightGrey'  # Gridline color
         | 
| 156 | 
            +
            	    ),
         | 
| 157 | 
            +
            	    yaxis=dict(
         | 
| 158 | 
            +
            	        title="",
         | 
| 159 | 
            +
            	    ),
         | 
| 160 | 
            +
            	    height=400,
         | 
| 161 | 
            +
            	    showlegend=False  # Remove the legend
         | 
| 162 | 
            +
            		) 
         | 
| 163 | 
            +
            	st.plotly_chart(fig, use_container_width=True)
         | 
| 164 |  | 
| 165 |  | 
| 166 |  |