File size: 10,228 Bytes
0890648
6e53ade
0890648
6e53ade
9b7da23
6e53ade
9b7da23
 
 
0e6edda
 
68f24a9
0890648
e4a3a66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b7da23
ad3723a
9b7da23
 
 
 
 
 
 
 
 
8b78467
9b7da23
 
 
8b78467
9b7da23
8b78467
9b7da23
8b78467
9b7da23
8b78467
 
 
 
 
 
9b7da23
 
 
 
 
8b78467
 
 
 
 
 
9b7da23
8b78467
9b7da23
 
 
 
 
 
16634d5
0e6edda
8b78467
43564f9
1cb5c4f
 
 
 
8b78467
1cb5c4f
16634d5
9b7da23
8b78467
 
1cb5c4f
9b7da23
 
 
 
8b78467
1cb5c4f
9b7da23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e6edda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d285f5a
 
0e6edda
 
e4a3a66
 
 
0e6edda
9b7da23
 
 
 
 
ad3723a
9b7da23
 
 
0e6edda
9b7da23
 
0e6edda
 
 
 
 
 
 
 
 
 
9b7da23
 
 
 
 
0e6edda
9b7da23
 
 
 
6e53ade
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0890648
 
 
6e53ade
 
9b7da23
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import csv
import os
import sys

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from lemmatizer import Lemmatizer

# csv.field_size_limit(sys.maxsize) raises OverflowError on Windows, where the
# limit must fit in a C long; use the largest 32-bit signed value instead.
# (The previous nested call set the limit and then immediately restored the
# old default via the outer call's argument, so the raise never took effect.)
csv.field_size_limit(2**31 - 1)


def load_readme():
    """Return the text of README.md with any leading YAML frontmatter removed.

    The README sits next to this file. Frontmatter is the block delimited by a
    pair of "---" lines at the very top (as used by Hugging Face Spaces); if no
    closing "---" is found, the content is returned untouched.
    """
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # No frontmatter at all: nothing to strip.
    if not text.startswith("---"):
        return text

    rows = text.split("\n")
    for position in range(1, len(rows)):
        if rows[position].strip() == "---":
            # Drop everything up to and including the closing marker, plus
            # any blank lines immediately following it.
            return "\n".join(rows[position + 1:]).lstrip("\n")

    # Opening "---" without a closing marker: leave the content as-is.
    return text


# Loaded once at import time; rendered in the "About" accordion below.
readme_content = load_readme()

# gr.NO_RELOAD guards heavyweight initialisation so that `gradio app.py`
# hot-reload does not re-instantiate the model on every source change.
if gr.NO_RELOAD:
    # NOTE(review): `leanred_et` looks like a typo (learned_et?) — confirm
    # against the Lemmatizer constructor signature before renaming.
    lemmatizer = Lemmatizer(leanred_et=False)

# Human-readable display names keyed by idiom code.
_IDIOM_MAP = {
    "rm-rumgr": "Rumantsch Grischun",
    "rm-sursilv": "Sursilvan",
    "rm-sutsilv": "Sutsilvan",
    "rm-surmiran": "Surmiran",
    "rm-puter": "Puter",
    "rm-vallader": "Vallader",
}

# Chart order, reversed relative to _IDIOM_MAP because plotly horizontal bar
# charts plot from bottom to top.
_ORDERED_IDIOMS = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv", "rm-sursilv", "rm-rumgr"]


def _collect_token_analyses(doc):
    """Gather, per token, its lemmas with their analyses and German translations.

    Returns a list of dicts shaped
    ``{"token": str, "lemmas": {lemma_text: {"analyses": [...], "translations": [...]}}}``.
    """
    token_analyses = []
    for token in doc.tokens:
        token_info = {
            "token": token.text,
            "lemmas": {}
        }
        for lemma, analyses in token.lemmas.items():
            # Distinct lemma objects may share a surface form; merge them
            # under one text key.
            if lemma.text not in token_info["lemmas"]:
                token_info["lemmas"][lemma.text] = {
                    "analyses": [],
                    "translations": []
                }
            entry = token_info["lemmas"][lemma.text]

            for analysis in analyses:
                # Some analysis objects cannot be rendered as a string;
                # fall back to a placeholder rather than crashing the UI.
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                entry["analyses"].append(analysis_str)

            # "null" is the serialized absence of a translation; skip it.
            if getattr(lemma, "translation_de", None) and lemma.translation_de != "null":
                entry["translations"].append(lemma.translation_de)

        token_analyses.append(token_info)
    return token_analyses


def _token_dataframe(token_analyses):
    """Build the per-token HTML table (rendered as markdown by gr.DataFrame)."""
    return pd.DataFrame([
        {
            "Token": t["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
            "German translations": "<br>".join([
                f"<b>{lemma}</b>:\n" +
                "<br>".join([
                    f"<span style='font-style: italic; color: #B0B0B0;'>{tr}</span>"
                    # Shortest (then alphabetical) translations first.
                    for tr in sorted(
                        lem_data["translations"],
                        key=lambda x: (len(x), x.lower())
                    )[:10]  # limit to 10 translations per lemma
                ])
                for lemma, lem_data in t["lemmas"].items() if lem_data["translations"]
            ]),
            "Morphological Analysis": "<br>".join([
                f"<b>{lemma}</b>: " +
                "<br>".join(sorted(set(lem_data["analyses"])))
                for lemma, lem_data in t["lemmas"].items() if lem_data["analyses"]
            ])
        }
        for t in token_analyses
    ])


def _idiom_figure(idiom_scores, detected_idiom):
    """Horizontal bar chart of per-idiom scores, highlighting the detected idiom.

    ``idiom_scores`` maps Idiom enum members to fractional scores in [0, 1];
    ``detected_idiom`` is the winning idiom's code (e.g. "rm-sursilv").
    """
    # Re-key the scores by idiom code, preserving the fixed display order and
    # skipping idioms the lemmatizer did not score.
    ordered_data = []
    for idiom_code in _ORDERED_IDIOMS:
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": _IDIOM_MAP[idiom_code],
                "score": round(score * 100, 1)  # percentage, one decimal
            })

    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]

    # Detected idiom in blue, the rest in a muted grey-blue.
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,  # Use display names for idioms
            x=score_values,
            marker_color=colors,
            orientation='h',  # Set horizontal orientation
            width=0.4  # Make bars narrower (height in horizontal orientation)
        )
    ])

    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',  # Format tick labels with 1 decimal place
            ticksuffix='%',     # Add % suffix to tick labels
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            ticksuffix=' ',   # Add space between idiom labels and bars
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
            color='rgb(39, 39, 42)'
        ),
    )

    # Show percentages on hover.
    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig


def process_text(text):
    """Analyse *text* with the module-level lemmatizer.

    Returns a tuple ``(fig, df_tokens)``: a plotly figure of per-idiom scores
    and a pandas DataFrame with one row per token (lemmas, German
    translations, morphological analyses as HTML fragments).
    """
    doc = lemmatizer(text)
    fig = _idiom_figure(doc.idiom_scores, doc.idiom.value)
    df_tokens = _token_dataframe(_collect_token_analyses(doc))
    return fig, df_tokens

# Build the Gradio UI. Layout: title, collapsible About section, an input/chart
# row, a full-width results table, and click-to-run examples loaded from a TSV.
with gr.Blocks(
    title="Lemmatizer",
    # NOTE(review): the ".svelte-drum8y" selector below is a generated class
    # name and may break across Gradio version upgrades — re-check after bumps.
    css="""
    /* ===== Table Styling ===== */
    #full-width-table .wrap.svelte-drum8y, 
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }

    #full-width-table td, 
    #full-width-table th {
        white-space: nowrap !important;
    }

    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }

    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }

    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }

    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }

    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }

    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }

    """
) as demo:


    # Adjacent string literals concatenate: title plus a superscript BETA tag.
    gr.Markdown(
        "# Romansh Lemmatizer"
        "<sup style='color:#FF5252;font-size:0.4em;vertical-align:super'>(BETA)</sup>"
    )

    with gr.Accordion("About", open=False):
        gr.Markdown(readme_content)

    # === Top Row: Input & Chart ===
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5
            )
            submit_btn = gr.Button("Analyze")

        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # === Bottom Row: Full-width Table ===
    # datatype="markdown" lets the HTML fragments built in process_text render.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # prevent Gradio from wrapping text
        elem_id="full-width-table"
    )

    # === Function Hook ===
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )

    
    # Add examples from TSV file
    # Read examples from the TSV file; one column per idiom, one sentence per cell.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    # Read the TSV file into a pandas DataFrame
    df = pd.read_csv(tsv_path, sep='\t')

    # Create a list of (sentence, idiom-column) pairs, skipping empty cells.
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # Skip empty sentences
                examples_data.append((sentence, col))

    # Create the Examples component with idiom labels and sentence content.
    # The label shown in the UI carries the idiom tag; only the raw sentence
    # is fed into the textbox.
    examples = [sentence for sentence, _ in examples_data]
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]

    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        # Eagerly pre-compute example outputs so clicks respond instantly.
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )


# Script entry point: start the Gradio web server when run directly.
if __name__ == "__main__":
    demo.launch()