"""Gradio app for the Romansh lemmatizer: idiom detection and word analysis."""
| import csv | |
| import os | |
| import sys | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from lemmatizer import Lemmatizer | |
# Allow very large CSV fields (default limit is 128 KiB). sys.maxsize
# overflows the C long on 64-bit Windows, so use 2**31 - 1, the largest
# value accepted on every platform.
#
# BUG FIX: the previous nested call
#     csv.field_size_limit(csv.field_size_limit(2**31 - 1))
# was a no-op — field_size_limit() RETURNS the old limit, so the outer
# call immediately restored the default. A single call is correct.
csv.field_size_limit(2**31 - 1)
def load_readme():
    """Return README.md's text with any leading YAML frontmatter removed."""
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # No frontmatter marker at the very start: return the file untouched.
    if not text.startswith("---"):
        return text

    # Find the closing "---" line (the second marker); skip line 0, which
    # holds the opening marker itself.
    rows = text.split("\n")
    closing = next(
        (i for i, row in enumerate(rows[1:], start=1) if row.strip() == "---"),
        None,
    )
    if closing is None:
        # Unterminated frontmatter — leave the content as-is.
        return text

    # Keep everything after the closing marker, dropping blank lines that
    # immediately follow it.
    return "\n".join(rows[closing + 1:]).lstrip("\n")
# Loaded once at import time; rendered in the "About" accordion below.
readme_content = load_readme()

# gr.NO_RELOAD guards heavyweight initialisation from re-running on every
# hot-reload cycle when the app is launched with `gradio app.py`.
# NOTE(review): the keyword `leanred_et` looks like a typo for
# `learned_et` — confirm against the Lemmatizer constructor signature
# before renaming it.
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(leanred_et=False)
def _collect_token_analyses(doc):
    """Gather lemma, morphology, and translation info for each token in *doc*.

    Returns a list of dicts of the form
    ``{"token": str, "lemmas": {lemma_text: {"analyses": [...], "translations": [...]}}}``.
    """
    token_analyses = []
    for token in doc.tokens:
        token_info = {"token": token.text, "lemmas": {}}
        for lemma, analyses in token.lemmas.items():
            # Distinct lemma objects may share the same surface text;
            # merge their analyses/translations into one entry.
            entry = token_info["lemmas"].setdefault(
                lemma.text, {"analyses": [], "translations": []}
            )
            for analysis in analyses:
                try:
                    entry["analyses"].append(str(analysis))
                except AttributeError:
                    # Defensive: some analysis objects may not stringify.
                    entry["analyses"].append("-")
            # "null" is the lexicon's placeholder for a missing translation.
            translation = getattr(lemma, "translation_de", None)
            if translation and translation != "null":
                entry["translations"].append(translation)
        token_analyses.append(token_info)
    return token_analyses


def _tokens_dataframe(token_analyses):
    """Render collected token analyses as an HTML-formatted DataFrame.

    Cell markup (<b>, <br>, <span>) is rendered by the gr.DataFrame
    component configured with ``datatype="markdown"``.
    """
    rows = []
    for info in token_analyses:
        lemmas = info["lemmas"]
        translations_html = "<br>".join([
            f"<b>{lemma}</b>:\n" +
            "<br>".join([
                f"<span style='font-style: italic; color: #B0B0B0;'>{tr}</span>"
                # Shortest (then alphabetical) first; cap at 10 per lemma.
                for tr in sorted(
                    lem_data["translations"],
                    key=lambda x: (len(x), x.lower())
                )[:10]
            ])
            for lemma, lem_data in lemmas.items() if lem_data["translations"]
        ])
        analyses_html = "<br>".join([
            f"<b>{lemma}</b>: " +
            "<br>".join(sorted(set(lem_data["analyses"])))
            for lemma, lem_data in lemmas.items() if lem_data["analyses"]
        ])
        rows.append({
            "Token": info["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in lemmas.keys()]),
            "German translations": translations_html,
            "Morphological Analysis": analyses_html,
        })
    return pd.DataFrame(rows)


def _idiom_figure(idiom_scores, detected_idiom):
    """Build a horizontal bar chart of per-idiom scores.

    Parameters
    ----------
    idiom_scores : dict
        Maps Idiom enum members to scores (fractions in [0, 1]).
    detected_idiom : str
        Code of the winning idiom (e.g. "rm-sursilv"); its bar is drawn
        in blue, all others in grey.
    """
    # Human-readable names for the six Romansh idiom codes.
    idiom_map = {
        "rm-rumgr": "Rumantsch Grischun",
        "rm-sursilv": "Sursilvan",
        "rm-sutsilv": "Sutsilvan",
        "rm-surmiran": "Surmiran",
        "rm-puter": "Puter",
        "rm-vallader": "Vallader",
    }
    # Reversed for display: plotly plots horizontal bars bottom-to-top.
    ordered_idioms = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv",
                      "rm-sursilv", "rm-rumgr"]

    ordered_data = []
    for idiom_code in ordered_idioms:
        # idiom_scores is keyed by Idiom enum members; match on .value.
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": idiom_map[idiom_code],
                "score": round(score * 100, 1),  # as a percentage, 1 decimal
            })

    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]
    # Highlight the detected idiom's bar.
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8"
              for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,
            x=score_values,
            marker_color=colors,
            orientation='h',
            width=0.4  # narrower bars ("width" is bar height when horizontal)
        )
    ])
    mono_font = '"IBM Plex Mono", ui-monospace, Consolas, monospace'
    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(
                family=mono_font,
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',  # one decimal place on tick labels
            ticksuffix='%',
            tickfont=dict(
                family=mono_font,
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            ticksuffix=' ',  # gap between idiom labels and bars
            tickfont=dict(
                family=mono_font,
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family=mono_font,
            color='rgb(39, 39, 42)'
        ),
    )
    # Hover shows "Name: 12.3%".
    fig.update_traces(hovertemplate='%{y}: %{x:.1f}%<extra></extra>')
    return fig


def process_text(text):
    """Lemmatize *text* and build both UI outputs.

    Parameters
    ----------
    text : str
        Romansh input text.

    Returns
    -------
    tuple
        (plotly Figure with per-idiom scores, pandas DataFrame with one
        row per token: lemmas, German translations, morphology).
    """
    doc = lemmatizer(text)
    token_analyses = _collect_token_analyses(doc)
    df_tokens = _tokens_dataframe(token_analyses)
    fig = _idiom_figure(doc.idiom_scores, doc.idiom.value)
    return fig, df_tokens
# NOTE(review): the ".svelte-drum8y" class below is generated per Gradio
# build — confirm it still matches the pinned Gradio version.
with gr.Blocks(
    title="Lemmatizer",
    css="""
    /* ===== Table Styling ===== */
    #full-width-table .wrap.svelte-drum8y,
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }
    #full-width-table td,
    #full-width-table th {
        white-space: nowrap !important;
    }
    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }
    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }
    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }
    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }
    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }
    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }
    """
) as demo:
    # Page title with a small red BETA superscript.
    gr.Markdown(
        "# Romansh Lemmatizer"
        "<sup style='color:#FF5252;font-size:0.4em;vertical-align:super'>(BETA)</sup>"
    )
    with gr.Accordion("About", open=False):
        gr.Markdown(readme_content)

    # === Top Row: Input & Chart ===
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5,
                # FIX: the CSS above targets #input-box, but no component
                # carried that elem_id, so the height rules were inert.
                elem_id="input-box"
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # === Bottom Row: Full-width Table ===
    # datatype="markdown" lets the <b>/<br>/<span> markup produced by
    # process_text render inside the cells.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # prevent Gradio from wrapping text
        elem_id="full-width-table"
    )

    # === Function Hook ===
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )

    # === Examples ===
    # example_sentences.tsv: one column per idiom, one sentence per cell.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    df = pd.read_csv(tsv_path, sep='\t')

    # Pair every non-empty sentence with its idiom (the column header).
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # skip empty sentences
                examples_data.append((sentence, col))

    examples = [sentence for sentence, _ in examples_data]
    # Show "[idiom:] sentence" in the examples list while only the
    # sentence itself is fed into the textbox.
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        # Pre-compute all example outputs at startup so clicks are instant.
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )

if __name__ == "__main__":
    demo.launch()