File size: 10,228 Bytes
0890648
6e53ade
0890648
6e53ade
9b7da23
6e53ade
9b7da23
 
 
0e6edda
 
68f24a9
0890648
e4a3a66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b7da23
ad3723a
9b7da23
 
 
 
 
 
 
 
 
8b78467
9b7da23
 
 
8b78467
9b7da23
8b78467
9b7da23
8b78467
9b7da23
8b78467
 
 
 
 
 
9b7da23
 
 
 
 
8b78467
 
 
 
 
 
9b7da23
8b78467
9b7da23
 
 
 
 
 
16634d5
0e6edda
8b78467
43564f9
1cb5c4f
 
 
 
8b78467
1cb5c4f
16634d5
9b7da23
8b78467
 
1cb5c4f
9b7da23
 
 
 
8b78467
1cb5c4f
9b7da23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0e6edda
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d285f5a
 
0e6edda
 
e4a3a66
 
 
0e6edda
9b7da23
 
 
 
 
ad3723a
9b7da23
 
 
0e6edda
9b7da23
 
0e6edda
 
 
 
 
 
 
 
 
 
9b7da23
 
 
 
 
0e6edda
9b7da23
 
 
 
6e53ade
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0890648
 
 
6e53ade
 
9b7da23
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
import csv
import os
import sys

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from lemmatizer import Lemmatizer

# csv.field_size_limit(sys.maxsize) raises OverflowError on Windows, where the
# limit must fit in a C long; use the largest 32-bit signed value instead.
# (The previous nested call set the limit and then immediately restored the
# old default via the outer call's argument, so the raise never took effect.)
csv.field_size_limit(2**31 - 1)


def load_readme():
    """Return the text of README.md with any leading YAML frontmatter removed.

    The README sits next to this file. Frontmatter is the block delimited by a
    pair of "---" lines at the very top (as used by Hugging Face Spaces); if no
    closing "---" is found, the content is returned untouched.
    """
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # No frontmatter at all: nothing to strip.
    if not text.startswith("---"):
        return text

    rows = text.split("\n")
    for position in range(1, len(rows)):
        if rows[position].strip() == "---":
            # Drop everything up to and including the closing marker, plus
            # any blank lines immediately following it.
            return "\n".join(rows[position + 1:]).lstrip("\n")

    # Opening "---" without a closing marker: leave the content as-is.
    return text


# Loaded once at import time; rendered in the "About" accordion below.
readme_content = load_readme()

# gr.NO_RELOAD guards heavyweight initialisation so that `gradio app.py`
# hot-reload does not re-instantiate the model on every source change.
if gr.NO_RELOAD:
    # NOTE(review): `leanred_et` looks like a typo (learned_et?) — confirm
    # against the Lemmatizer constructor signature before renaming.
    lemmatizer = Lemmatizer(leanred_et=False)

# Human-readable display names keyed by idiom code.
_IDIOM_MAP = {
    "rm-rumgr": "Rumantsch Grischun",
    "rm-sursilv": "Sursilvan",
    "rm-sutsilv": "Sutsilvan",
    "rm-surmiran": "Surmiran",
    "rm-puter": "Puter",
    "rm-vallader": "Vallader",
}

# Chart order, reversed relative to _IDIOM_MAP because plotly horizontal bar
# charts plot from bottom to top.
_ORDERED_IDIOMS = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv", "rm-sursilv", "rm-rumgr"]


def _collect_token_analyses(doc):
    """Gather, per token, its lemmas with their analyses and German translations.

    Returns a list of dicts shaped
    ``{"token": str, "lemmas": {lemma_text: {"analyses": [...], "translations": [...]}}}``.
    """
    token_analyses = []
    for token in doc.tokens:
        token_info = {
            "token": token.text,
            "lemmas": {}
        }
        for lemma, analyses in token.lemmas.items():
            # Distinct lemma objects may share a surface form; merge them
            # under one text key.
            if lemma.text not in token_info["lemmas"]:
                token_info["lemmas"][lemma.text] = {
                    "analyses": [],
                    "translations": []
                }
            entry = token_info["lemmas"][lemma.text]

            for analysis in analyses:
                # Some analysis objects cannot be rendered as a string;
                # fall back to a placeholder rather than crashing the UI.
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                entry["analyses"].append(analysis_str)

            # "null" is the serialized absence of a translation; skip it.
            if getattr(lemma, "translation_de", None) and lemma.translation_de != "null":
                entry["translations"].append(lemma.translation_de)

        token_analyses.append(token_info)
    return token_analyses


def _token_dataframe(token_analyses):
    """Build the per-token HTML table (rendered as markdown by gr.DataFrame)."""
    return pd.DataFrame([
        {
            "Token": t["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
            "German translations": "<br>".join([
                f"<b>{lemma}</b>:\n" +
                "<br>".join([
                    f"<span style='font-style: italic; color: #B0B0B0;'>{tr}</span>"
                    # Shortest (then alphabetical) translations first.
                    for tr in sorted(
                        lem_data["translations"],
                        key=lambda x: (len(x), x.lower())
                    )[:10]  # limit to 10 translations per lemma
                ])
                for lemma, lem_data in t["lemmas"].items() if lem_data["translations"]
            ]),
            "Morphological Analysis": "<br>".join([
                f"<b>{lemma}</b>: " +
                "<br>".join(sorted(set(lem_data["analyses"])))
                for lemma, lem_data in t["lemmas"].items() if lem_data["analyses"]
            ])
        }
        for t in token_analyses
    ])


def _idiom_figure(idiom_scores, detected_idiom):
    """Horizontal bar chart of per-idiom scores, highlighting the detected idiom.

    ``idiom_scores`` maps Idiom enum members to fractional scores in [0, 1];
    ``detected_idiom`` is the winning idiom's code (e.g. "rm-sursilv").
    """
    # Re-key the scores by idiom code, preserving the fixed display order and
    # skipping idioms the lemmatizer did not score.
    ordered_data = []
    for idiom_code in _ORDERED_IDIOMS:
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": _IDIOM_MAP[idiom_code],
                "score": round(score * 100, 1)  # percentage, one decimal
            })

    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]

    # Detected idiom in blue, the rest in a muted grey-blue.
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,  # Use display names for idioms
            x=score_values,
            marker_color=colors,
            orientation='h',  # Set horizontal orientation
            width=0.4  # Make bars narrower (height in horizontal orientation)
        )
    ])

    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',  # Format tick labels with 1 decimal place
            ticksuffix='%',     # Add % suffix to tick labels
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            ticksuffix=' ',   # Add space between idiom labels and bars
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
            color='rgb(39, 39, 42)'
        ),
    )

    # Show percentages on hover.
    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig


def process_text(text):
    """Analyse *text* with the module-level lemmatizer.

    Returns a tuple ``(fig, df_tokens)``: a plotly figure of per-idiom scores
    and a pandas DataFrame with one row per token (lemmas, German
    translations, morphological analyses as HTML fragments).
    """
    doc = lemmatizer(text)
    fig = _idiom_figure(doc.idiom_scores, doc.idiom.value)
    df_tokens = _token_dataframe(_collect_token_analyses(doc))
    return fig, df_tokens

# Build the Gradio UI. Layout: title, collapsible About section, an input/chart
# row, a full-width results table, and click-to-run examples loaded from a TSV.
with gr.Blocks(
    title="Lemmatizer",
    # NOTE(review): the ".svelte-drum8y" selector below is a generated class
    # name and may break across Gradio version upgrades — re-check after bumps.
    css="""
    /* ===== Table Styling ===== */
    #full-width-table .wrap.svelte-drum8y, 
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }

    #full-width-table td, 
    #full-width-table th {
        white-space: nowrap !important;
    }

    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }

    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }

    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }

    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }

    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }

    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }

    """
) as demo:


    # Adjacent string literals concatenate: title plus a superscript BETA tag.
    gr.Markdown(
        "# Romansh Lemmatizer"
        "<sup style='color:#FF5252;font-size:0.4em;vertical-align:super'>(BETA)</sup>"
    )

    with gr.Accordion("About", open=False):
        gr.Markdown(readme_content)

    # === Top Row: Input & Chart ===
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5
            )
            submit_btn = gr.Button("Analyze")

        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # === Bottom Row: Full-width Table ===
    # datatype="markdown" lets the HTML fragments built in process_text render.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # prevent Gradio from wrapping text
        elem_id="full-width-table"
    )

    # === Function Hook ===
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )

    
    # Add examples from TSV file
    # Read examples from the TSV file; one column per idiom, one sentence per cell.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    # Read the TSV file into a pandas DataFrame
    df = pd.read_csv(tsv_path, sep='\t')

    # Create a list of (sentence, idiom-column) pairs, skipping empty cells.
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # Skip empty sentences
                examples_data.append((sentence, col))

    # Create the Examples component with idiom labels and sentence content.
    # The label shown in the UI carries the idiom tag; only the raw sentence
    # is fed into the textbox.
    examples = [sentence for sentence, _ in examples_data]
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]

    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        # Eagerly pre-compute example outputs so clicks respond instantly.
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )


# Script entry point: start the Gradio web server when run directly.
if __name__ == "__main__":
    demo.launch()