import csv
import os
import sys

import gradio as gr
import pandas as pd
import plotly.graph_objects as go

from lemmatizer import Lemmatizer
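
# Note: on Windows, csv.field_size_limit(sys.maxsize) raises OverflowError
# because sys.maxsize does not fit in a C long, hence the 2**31 - 1 cap below.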
# csv.field_size_limit(sys.maxsize)
csv.field_size_limit(2**31 - 1)
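

# On a Hugging Face Space, README.md starts with YAML frontmatter (title, sdk,
# emoji, ...) that configures the Space itself; it is stripped here so only the
# prose is rendered in the "About" accordion further down.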
def load_readme():
    """Load README.md content and strip YAML frontmatter."""
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, "r", encoding="utf-8") as file:
        content = file.read()
    # Strip YAML frontmatter (content between --- markers)
    if content.startswith("---"):
        # Find the second occurrence of ---
        lines = content.split("\n")
        frontmatter_end = None
        for index, line in enumerate(lines[1:], start=1):
            if line.strip() == "---":
                frontmatter_end = index
                break
        if frontmatter_end is not None:
            # Keep only the content after the frontmatter, dropping leading blank lines
            content = "\n".join(lines[frontmatter_end + 1:]).lstrip("\n")
    return content

readme_content = load_readme()
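
# gr.NO_RELOAD guards one-time, expensive setup: when the app runs under
# `gradio app.py` hot reload, this block is skipped on re-execution, so the
# lemmatizer model is loaded only once.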
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(learned_et=False)
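

# process_text drives both outputs of the UI: a horizontal Plotly bar chart of
# per-idiom scores and a per-token DataFrame (lemmas, German translations,
# morphological analyses).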
def process_text(text):
    doc = lemmatizer(text)
    idiom_scores = doc.idiom_scores
    detected_idiom = doc.idiom.value

    # Create a list to store token analyses
    token_analyses = []
    for token in doc.tokens:
        token_info = {
            "token": token.text,
            "lemmas": {}
        }
        for lemma, analyses in token.lemmas.items():
            # Initialize lemma entry
            if lemma.text not in token_info["lemmas"]:
                token_info["lemmas"][lemma.text] = {
                    "analyses": [],
                    "translations": []
                }
            # Collect analyses
            for analysis in analyses:
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                token_info["lemmas"][lemma.text]["analyses"].append(analysis_str)
            # Collect lemma-specific translation
            if getattr(lemma, "translation_de", None) and lemma.translation_de != "null":
                token_info["lemmas"][lemma.text]["translations"].append(lemma.translation_de)
        token_analyses.append(token_info)
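
    # The cells below embed raw HTML (<b>, <br>, <span>) on purpose: the
    # results table is a gr.DataFrame with datatype="markdown", which renders
    # inline HTML inside its cells.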
    # Create DataFrame for token analysis
    df_tokens = pd.DataFrame([
        {
            "Token": t["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
            "German translations": "<br>".join([
                f"<b>{lemma}</b>:\n" +
                "<br>".join([
                    f"<span style='font-style: italic; color: #B0B0B0;'>{tr}</span>"
                    for tr in sorted(
                        lem_data["translations"],
                        key=lambda x: (len(x), x.lower())
                    )[:10]  # limit to 10 translations per lemma
                ])
                for lemma, lem_data in t["lemmas"].items() if lem_data["translations"]
            ]),
            "Morphological Analysis": "<br>".join([
                f"<b>{lemma}</b>: " +
                "<br>".join(sorted(set(lem_data["analyses"])))
                for lemma, lem_data in t["lemmas"].items() if lem_data["analyses"]
            ])
        }
        for t in token_analyses
    ])

    # Create bar chart data for idiom scores using plotly
    # Define idiom display names and order
    idiom_map = {
        "rm-rumgr": "Rumantsch Grischun",
        "rm-sursilv": "Sursilvan",
        "rm-sutsilv": "Sutsilvan",
        "rm-surmiran": "Surmiran",
        "rm-puter": "Puter",
        "rm-vallader": "Vallader",
    }
    # Create ordered list of idioms (reversed for display since the chart plots from bottom to top)
    ordered_idioms = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv", "rm-sursilv", "rm-rumgr"]

    # Create ordered data for the chart
    ordered_data = []
    for idiom_code in ordered_idioms:
        # Find the corresponding Idiom enum value in the keys
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": idiom_map[idiom_code],
                "score": round(score * 100, 1)
            })

    # Extract values for plotting
    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]
    # Set colors based on detected idiom
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,  # Use display names for idioms
            x=score_values,
            marker_color=colors,
            orientation='h',  # Set horizontal orientation
            width=0.4  # Make bars narrower (width acts as bar height when horizontal)
        )
    ])
    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',  # Format tick labels with 1 decimal place
            ticksuffix='%',  # Add % suffix to tick labels
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            ticksuffix=' ',  # Add space between idiom labels and bars
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
            color='rgb(39, 39, 42)'
        ),
    )
    # Update hover template to show percentages
    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig, df_tokens


with gr.Blocks(
    title="Lemmatizer",
    css="""
    /* ===== Table Styling ===== */
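    /* NOTE: .wrap.svelte-drum8y matches a generated Svelte class; the hash can
       change between Gradio versions, so this selector may need updating after
       upgrading the gradio dependency. */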
    #full-width-table .wrap.svelte-drum8y,
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }
    #full-width-table td,
    #full-width-table th {
        white-space: nowrap !important;
    }

    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }
    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }
    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }
    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }

    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }
    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }
    """
) as demo:
    gr.Markdown(
        "# Romansh Lemmatizer"
        "<sup style='color:#FF5252;font-size:0.4em;vertical-align:super'>(BETA)</sup>"
    )

    with gr.Accordion("About", open=False):
        gr.Markdown(readme_content)
# === Top Row: Input & Chart ===
with gr.Row():
with gr.Column(scale=1):
text_input = gr.Textbox(
label="Input Text",
placeholder="Enter Romansh text here...",
value="La vulp era puspè ina giada fomentada.",
lines=5
)
submit_btn = gr.Button("Analyze")
with gr.Column(scale=2):
idiom_chart = gr.Plot(label="Detected Idioms")
# === Bottom Row: Full-width Table ===
token_table = gr.DataFrame(
label="Analysis of Words",
datatype="markdown",
wrap=False, # prevent Gradio from wrapping text
elem_id="full-width-table"
)
# === Function Hook ===
submit_btn.click(
fn=process_text,
inputs=[text_input],
outputs=[idiom_chart, token_table]
)

    # Load example sentences from the bundled TSV file
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    df = pd.read_csv(tsv_path, sep='\t')
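    # Each TSV column holds the sentences for one idiom; the column header is
    # the idiom code used in the example labels below.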
    # Create a list of examples with their idiom labels
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # Skip empty sentences
                examples_data.append((sentence, col))

    # Create the Examples component with idiom labels and sentence content
    examples = [sentence for sentence, _ in examples_data]
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )


if __name__ == "__main__":
    demo.launch()