import csv import os import sys import gradio as gr import pandas as pd import plotly.graph_objects as go from lemmatizer import Lemmatizer #csv.field_size_limit(sys.maxsize) csv.field_size_limit(csv.field_size_limit(2**31 - 1) ) def load_readme(): """Load README.md content and strip YAML frontmatter.""" readme_path = os.path.join(os.path.dirname(__file__), "README.md") with open(readme_path, "r", encoding="utf-8") as file: content = file.read() # Strip YAML frontmatter (content between --- markers) if content.startswith("---"): # Find the second occurrence of --- lines = content.split("\n") frontmatter_end = None for index, line in enumerate(lines[1:], start=1): if line.strip() == "---": frontmatter_end = index break if frontmatter_end is not None: # Return content after frontmatter, skipping the blank line if present content = "\n".join(lines[frontmatter_end + 1:]).lstrip("\n") return content readme_content = load_readme() if gr.NO_RELOAD: lemmatizer = Lemmatizer(leanred_et=False) def process_text(text): doc = lemmatizer(text) idiom_scores = doc.idiom_scores detected_idiom = doc.idiom.value # Create a list to store token analyses token_analyses = [] for token in doc.tokens: token_info = { "token": token.text, "lemmas": {} } for lemma, analyses in token.lemmas.items(): # Initialize lemma entry if lemma.text not in token_info["lemmas"]: token_info["lemmas"][lemma.text] = { "analyses": [], "translations": [] } # Collect analyses for analysis in analyses: try: analysis_str = str(analysis) except AttributeError: analysis_str = "-" token_info["lemmas"][lemma.text]["analyses"].append(analysis_str) # Collect lemma-specific translation if getattr(lemma, "translation_de", None) and lemma.translation_de != "null": token_info["lemmas"][lemma.text]["translations"].append(lemma.translation_de) token_analyses.append(token_info) # Create DataFrame for token analysis df_tokens = pd.DataFrame([ { "Token": t["token"], "Lemma": "
".join([f"{lemma}" for lemma in t["lemmas"].keys()]), "German translations": "
".join([ f"{lemma}:\n" + "
".join([ f"{tr}" for tr in sorted( lem_data["translations"], key=lambda x: (len(x), x.lower()) )[:10] # limit to 10 translations per lemma ]) for lemma, lem_data in t["lemmas"].items() if lem_data["translations"] ]), "Morphological Analysis": "
".join([ f"{lemma}: " + "
".join(sorted(set(lem_data["analyses"]))) for lemma, lem_data in t["lemmas"].items() if lem_data["analyses"] ]) } for t in token_analyses ]) # Create bar chart data for idiom scores using plotly # Define idiom display names and order idiom_map = { "rm-rumgr": "Rumantsch Grischun", "rm-sursilv": "Sursilvan", "rm-sutsilv": "Sutsilvan", "rm-surmiran": "Surmiran", "rm-puter": "Puter", "rm-vallader": "Vallader", } # Create ordered list of idioms (reversed for display since the chart plots from bottom to top) ordered_idioms = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv", "rm-sursilv", "rm-rumgr"] # Create ordered data for the chart ordered_data = [] for idiom_code in ordered_idioms: # Find the corresponding Idiom enum value in the keys matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code] if matching_idioms: score = idiom_scores[matching_idioms[0]] ordered_data.append({ "idiom_code": idiom_code, "idiom_name": idiom_map[idiom_code], "score": round(score * 100, 1) }) # Extract values for plotting idiom_display_names = [item["idiom_name"] for item in ordered_data] score_values = [item["score"] for item in ordered_data] idiom_codes = [item["idiom_code"] for item in ordered_data] # Set colors based on detected idiom colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes] fig = go.Figure(data=[ go.Bar( y=idiom_display_names, # Use display names for idioms x=score_values, marker_color=colors, orientation='h', # Set horizontal orientation width=0.4 # Make bars narrower (height in horizontal orientation) ) ]) fig.update_layout( height=400, plot_bgcolor='#FAFAFA', paper_bgcolor='#FAFAFA', xaxis=dict( title="(Number of words found in Pledari Grond)", title_font=dict( family='"IBM Plex Mono", ui-monospace, Consolas, monospace', color='rgb(39, 39, 42)', size=12 ), tickformat='.1f', # Format tick labels with 1 decimal place ticksuffix='%', # Add % suffix to tick labels tickfont=dict( family='"IBM Plex Mono", ui-monospace, Consolas, monospace', color='rgb(39, 39, 42)' ) ), yaxis=dict( ticksuffix=' ', # Add space between idiom labels and bars tickfont=dict( family='"IBM Plex Mono", ui-monospace, Consolas, monospace', color='rgb(39, 39, 42)' ) ), font=dict( family='"IBM Plex Mono", ui-monospace, Consolas, monospace', color='rgb(39, 39, 42)' ), ) # Update hover template to show percentages fig.update_traces( hovertemplate='%{y}: %{x:.1f}%' ) # No need to return detected idiom anymore return fig, df_tokens with gr.Blocks( title="Lemmatizer", css=""" /* ===== Table Styling ===== */ #full-width-table .wrap.svelte-drum8y, #full-width-table table { width: 100% !important; table-layout: auto !important; } #full-width-table td, #full-width-table th { white-space: nowrap !important; } /* === Specific column width adjustments === */ #full-width-table table th:nth-child(1), #full-width-table table td:nth-child(1) { min-width: 200px !important; /* Word column */ } #full-width-table table th:nth-child(2), #full-width-table table td:nth-child(2) { min-width: 200px !important; /* Lemma column */ } #full-width-table table th:nth-child(3), #full-width-table table td:nth-child(3) { min-width: 200px !important; /* German translations column */ } #full-width-table table th:nth-child(4), #full-width-table table td:nth-child(4) { min-width: 300px !important; /* Morphological Analysis column */ } /* ===== Input box height control ===== */ #input-box { display: flex !important; flex-direction: column !important; height: 360px !important; /* visually matches plot height ~400px */ overflow: hidden !important; } #input-box textarea { flex-grow: 1 !important; height: 100% !important; max-height: 100% !important; overflow-y: auto !important; resize: none !important; } """ ) as demo: gr.Markdown( "# Romansh Lemmatizer" "(BETA)" ) with gr.Accordion("About", open=False): gr.Markdown(readme_content) # === Top Row: Input & Chart === with gr.Row(): with gr.Column(scale=1): text_input = gr.Textbox( label="Input Text", placeholder="Enter Romansh text here...", value="La vulp era puspè ina giada fomentada.", lines=5 ) submit_btn = gr.Button("Analyze") with gr.Column(scale=2): idiom_chart = gr.Plot(label="Detected Idioms") # === Bottom Row: Full-width Table === token_table = gr.DataFrame( label="Analysis of Words", datatype="markdown", wrap=False, # prevent Gradio from wrapping text elem_id="full-width-table" ) # === Function Hook === submit_btn.click( fn=process_text, inputs=[text_input], outputs=[idiom_chart, token_table] ) # Add examples from TSV file # Read examples from the TSV file tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv") # Read the TSV file into a pandas DataFrame df = pd.read_csv(tsv_path, sep='\t') # Create a list of examples with their idiom labels examples_data = [] for col in df.columns: for sentence in df[col].dropna(): if sentence.strip(): # Skip empty sentences examples_data.append((sentence, col)) # Create the Examples component with idiom labels and sentence content examples = [sentence for sentence, _ in examples_data] example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data] gr.Examples( examples=examples, inputs=text_input, label="Example Sentences", example_labels=example_labels, examples_per_page=100, fn=process_text, outputs=[idiom_chart, token_table], run_on_click=True, cache_examples=True, cache_mode='eager', preload=0, ) if __name__ == "__main__": demo.launch()