import csv
import os
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from lemmatizer import Lemmatizer
# Allow very large CSV fields (lexicon rows can be huge).
# sys.maxsize overflows the C long on Windows, so use an explicit 32-bit cap.
csv.field_size_limit(2**31 - 1)
def load_readme():
"""Load README.md content and strip YAML frontmatter."""
readme_path = os.path.join(os.path.dirname(__file__), "README.md")
with open(readme_path, "r", encoding="utf-8") as file:
content = file.read()
# Strip YAML frontmatter (content between --- markers)
if content.startswith("---"):
# Find the second occurrence of ---
lines = content.split("\n")
frontmatter_end = None
for index, line in enumerate(lines[1:], start=1):
if line.strip() == "---":
frontmatter_end = index
break
if frontmatter_end is not None:
# Return content after frontmatter, skipping the blank line if present
content = "\n".join(lines[frontmatter_end + 1:]).lstrip("\n")
return content
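# Load once at import time so the "About" accordion can display it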
readme_content = load_readme()
# Load the model once; gr.NO_RELOAD keeps it from being re-initialized on Gradio hot reloads
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(learned_et=False)
def process_text(text):
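    """Run the lemmatizer on *text* and build the two UI outputs.

    Returns a Plotly bar chart of per-idiom scores and a pandas DataFrame
    with one row per token (lemmas, German translations, morphology).
    """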
    doc = lemmatizer(text)
    idiom_scores = doc.idiom_scores
    detected_idiom = doc.idiom.value
    # Create a list to store token analyses
    token_analyses = []
    for token in doc.tokens:
        token_info = {
            "token": token.text,
            "lemmas": {}
        }
        for lemma, analyses in token.lemmas.items():
            # Initialize lemma entry
            if lemma.text not in token_info["lemmas"]:
                token_info["lemmas"][lemma.text] = {
                    "analyses": [],
                    "translations": []
                }
            # Collect analyses
            for analysis in analyses:
                try:
                    analysis_str = str(analysis)
                except AttributeError:
                    analysis_str = "-"
                token_info["lemmas"][lemma.text]["analyses"].append(analysis_str)
            # Collect lemma-specific translation
            if getattr(lemma, "translation_de", None) and lemma.translation_de != "null":
                token_info["lemmas"][lemma.text]["translations"].append(lemma.translation_de)
        token_analyses.append(token_info)
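    # Cells use inline HTML (<b>, <br>, <span>) since the table below is
    # rendered with datatype="markdown", which passes HTML through to the browser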
    # Create DataFrame for token analysis
    df_tokens = pd.DataFrame([
        {
            "Token": t["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in t["lemmas"].keys()]),
            "German translations": "<br>".join([
                f"<b>{lemma}</b>:\n" +
                "<br>".join([
                    f"<span style='font-style: italic; color: #B0B0B0;'>{tr}</span>"
                    for tr in sorted(
                        lem_data["translations"],
                        key=lambda x: (len(x), x.lower())
                    )[:10]  # limit to 10 translations per lemma
                ])
                for lemma, lem_data in t["lemmas"].items() if lem_data["translations"]
            ]),
            "Morphological Analysis": "<br>".join([
                f"<b>{lemma}</b>: " +
                "<br>".join(sorted(set(lem_data["analyses"])))
                for lemma, lem_data in t["lemmas"].items() if lem_data["analyses"]
            ])
        }
        for t in token_analyses
    ])
    # Create bar chart data for idiom scores using plotly
    # Define idiom display names and order
    idiom_map = {
        "rm-rumgr": "Rumantsch Grischun",
        "rm-sursilv": "Sursilvan",
        "rm-sutsilv": "Sutsilvan",
        "rm-surmiran": "Surmiran",
        "rm-puter": "Puter",
        "rm-vallader": "Vallader",
    }
    # Create ordered list of idioms (reversed for display since the chart plots from bottom to top)
    ordered_idioms = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv", "rm-sursilv", "rm-rumgr"]
    # Create ordered data for the chart
    ordered_data = []
    for idiom_code in ordered_idioms:
        # Find the corresponding Idiom enum value among the keys
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": idiom_map[idiom_code],
                "score": round(score * 100, 1)
            })
    # Extract values for plotting
    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]
    # Set colors based on detected idiom
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8" for code in idiom_codes]
    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,  # Use display names for idioms
            x=score_values,
            marker_color=colors,
            orientation='h',  # Horizontal bars
            width=0.4  # Make bars narrower (height in horizontal orientation)
        )
    ])
    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Share of words found in Pledari Grond)",
            title_font=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',  # Format tick labels with 1 decimal place
            ticksuffix='%',  # Add % suffix to tick labels
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            ticksuffix=' ',  # Add space between idiom labels and bars
            tickfont=dict(
                family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family='"IBM Plex Mono", ui-monospace, Consolas, monospace',
            color='rgb(39, 39, 42)'
        ),
    )
    # Show percentages in the hover tooltip
    fig.update_traces(
        hovertemplate='%{y}: %{x:.1f}%<extra></extra>'
    )
    return fig, df_tokens
with gr.Blocks(
title="Lemmatizer",
css="""
/* ===== Table Styling ===== */
#full-width-table .wrap.svelte-drum8y,
#full-width-table table {
width: 100% !important;
table-layout: auto !important;
}
#full-width-table td,
#full-width-table th {
white-space: nowrap !important;
}
/* === Specific column width adjustments === */
#full-width-table table th:nth-child(1),
#full-width-table table td:nth-child(1) {
min-width: 200px !important; /* Word column */
}
#full-width-table table th:nth-child(2),
#full-width-table table td:nth-child(2) {
min-width: 200px !important; /* Lemma column */
}
#full-width-table table th:nth-child(3),
#full-width-table table td:nth-child(3) {
min-width: 200px !important; /* German translations column */
}
#full-width-table table th:nth-child(4),
#full-width-table table td:nth-child(4) {
min-width: 300px !important; /* Morphological Analysis column */
}
/* ===== Input box height control ===== */
#input-box {
display: flex !important;
flex-direction: column !important;
height: 360px !important; /* visually matches plot height ~400px */
overflow: hidden !important;
}
#input-box textarea {
flex-grow: 1 !important;
height: 100% !important;
max-height: 100% !important;
overflow-y: auto !important;
resize: none !important;
}
"""
) as demo:
    gr.Markdown(
        "# Romansh Lemmatizer"
        "<sup style='color:#FF5252;font-size:0.4em;vertical-align:super'>(BETA)</sup>"
    )
    with gr.Accordion("About", open=False):
        gr.Markdown(readme_content)
    # === Top Row: Input & Chart ===
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5,
                elem_id="input-box"  # targeted by the #input-box height CSS above
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")
    # === Bottom Row: Full-width Table ===
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # prevent Gradio from wrapping cell text
        elem_id="full-width-table"
    )
    # === Function Hook ===
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )
    # Read example sentences from the TSV file into a pandas DataFrame
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    df = pd.read_csv(tsv_path, sep='\t')
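    # The TSV is assumed to have one column per idiom, with one example
    # sentence per cell; the column name doubles as the idiom label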
    # Create a list of examples with their idiom labels
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # Skip empty sentences
                examples_data.append((sentence, col))
    # Create the Examples component with idiom labels and sentence content
    examples = [sentence for sentence, _ in examples_data]
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )
if __name__ == "__main__":
    demo.launch()