"""Gradio app for the Romansh lemmatizer: idiom detection and word analysis."""
| import csv | |
| import os | |
| import sys | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from lemmatizer import Lemmatizer | |
# Allow very large CSV fields (default limit is 128 KiB). sys.maxsize
# overflows the C long on 64-bit Windows, so use 2**31 - 1, the largest
# value accepted on every platform.
#
# BUG FIX: the previous nested call
#     csv.field_size_limit(csv.field_size_limit(2**31 - 1))
# was a no-op — field_size_limit() RETURNS the old limit, so the outer
# call immediately restored the default. A single call is correct.
csv.field_size_limit(2**31 - 1)
def load_readme():
    """Return README.md's text with any leading YAML frontmatter removed."""
    readme_path = os.path.join(os.path.dirname(__file__), "README.md")
    with open(readme_path, "r", encoding="utf-8") as handle:
        text = handle.read()

    # No frontmatter marker at the very start: return the file untouched.
    if not text.startswith("---"):
        return text

    # Find the closing "---" line (the second marker); skip line 0, which
    # holds the opening marker itself.
    rows = text.split("\n")
    closing = next(
        (i for i, row in enumerate(rows[1:], start=1) if row.strip() == "---"),
        None,
    )
    if closing is None:
        # Unterminated frontmatter — leave the content as-is.
        return text

    # Keep everything after the closing marker, dropping blank lines that
    # immediately follow it.
    return "\n".join(rows[closing + 1:]).lstrip("\n")
# Loaded once at import time; rendered in the "About" accordion below.
readme_content = load_readme()

# gr.NO_RELOAD guards heavyweight initialisation from re-running on every
# hot-reload cycle when the app is launched with `gradio app.py`.
# NOTE(review): the keyword `leanred_et` looks like a typo for
# `learned_et` — confirm against the Lemmatizer constructor signature
# before renaming it.
if gr.NO_RELOAD:
    lemmatizer = Lemmatizer(leanred_et=False)
def _collect_token_analyses(doc):
    """Gather lemma, morphology, and translation info for each token in *doc*.

    Returns a list of dicts of the form
    ``{"token": str, "lemmas": {lemma_text: {"analyses": [...], "translations": [...]}}}``.
    """
    token_analyses = []
    for token in doc.tokens:
        token_info = {"token": token.text, "lemmas": {}}
        for lemma, analyses in token.lemmas.items():
            # Distinct lemma objects may share the same surface text;
            # merge their analyses/translations into one entry.
            entry = token_info["lemmas"].setdefault(
                lemma.text, {"analyses": [], "translations": []}
            )
            for analysis in analyses:
                try:
                    entry["analyses"].append(str(analysis))
                except AttributeError:
                    # Defensive: some analysis objects may not stringify.
                    entry["analyses"].append("-")
            # "null" is the lexicon's placeholder for a missing translation.
            translation = getattr(lemma, "translation_de", None)
            if translation and translation != "null":
                entry["translations"].append(translation)
        token_analyses.append(token_info)
    return token_analyses


def _tokens_dataframe(token_analyses):
    """Render collected token analyses as an HTML-formatted DataFrame.

    Cell markup (<b>, <br>, <span>) is rendered by the gr.DataFrame
    component configured with ``datatype="markdown"``.
    """
    rows = []
    for info in token_analyses:
        lemmas = info["lemmas"]
        translations_html = "<br>".join([
            f"<b>{lemma}</b>:\n" +
            "<br>".join([
                f"<span style='font-style: italic; color: #B0B0B0;'>{tr}</span>"
                # Shortest (then alphabetical) first; cap at 10 per lemma.
                for tr in sorted(
                    lem_data["translations"],
                    key=lambda x: (len(x), x.lower())
                )[:10]
            ])
            for lemma, lem_data in lemmas.items() if lem_data["translations"]
        ])
        analyses_html = "<br>".join([
            f"<b>{lemma}</b>: " +
            "<br>".join(sorted(set(lem_data["analyses"])))
            for lemma, lem_data in lemmas.items() if lem_data["analyses"]
        ])
        rows.append({
            "Token": info["token"],
            "Lemma": "<br>".join([f"<b>{lemma}</b>" for lemma in lemmas.keys()]),
            "German translations": translations_html,
            "Morphological Analysis": analyses_html,
        })
    return pd.DataFrame(rows)


def _idiom_figure(idiom_scores, detected_idiom):
    """Build a horizontal bar chart of per-idiom scores.

    Parameters
    ----------
    idiom_scores : dict
        Maps Idiom enum members to scores (fractions in [0, 1]).
    detected_idiom : str
        Code of the winning idiom (e.g. "rm-sursilv"); its bar is drawn
        in blue, all others in grey.
    """
    # Human-readable names for the six Romansh idiom codes.
    idiom_map = {
        "rm-rumgr": "Rumantsch Grischun",
        "rm-sursilv": "Sursilvan",
        "rm-sutsilv": "Sutsilvan",
        "rm-surmiran": "Surmiran",
        "rm-puter": "Puter",
        "rm-vallader": "Vallader",
    }
    # Reversed for display: plotly plots horizontal bars bottom-to-top.
    ordered_idioms = ["rm-vallader", "rm-puter", "rm-surmiran", "rm-sutsilv",
                      "rm-sursilv", "rm-rumgr"]

    ordered_data = []
    for idiom_code in ordered_idioms:
        # idiom_scores is keyed by Idiom enum members; match on .value.
        matching_idioms = [i for i in idiom_scores.keys() if i.value == idiom_code]
        if matching_idioms:
            score = idiom_scores[matching_idioms[0]]
            ordered_data.append({
                "idiom_code": idiom_code,
                "idiom_name": idiom_map[idiom_code],
                "score": round(score * 100, 1),  # as a percentage, 1 decimal
            })

    idiom_display_names = [item["idiom_name"] for item in ordered_data]
    score_values = [item["score"] for item in ordered_data]
    idiom_codes = [item["idiom_code"] for item in ordered_data]
    # Highlight the detected idiom's bar.
    colors = ["#3062FF" if code == detected_idiom else "#BDC9E8"
              for code in idiom_codes]

    fig = go.Figure(data=[
        go.Bar(
            y=idiom_display_names,
            x=score_values,
            marker_color=colors,
            orientation='h',
            width=0.4  # narrower bars ("width" is bar height when horizontal)
        )
    ])
    mono_font = '"IBM Plex Mono", ui-monospace, Consolas, monospace'
    fig.update_layout(
        height=400,
        plot_bgcolor='#FAFAFA',
        paper_bgcolor='#FAFAFA',
        xaxis=dict(
            title="(Number of words found in Pledari Grond)",
            title_font=dict(
                family=mono_font,
                color='rgb(39, 39, 42)',
                size=12
            ),
            tickformat='.1f',  # one decimal place on tick labels
            ticksuffix='%',
            tickfont=dict(
                family=mono_font,
                color='rgb(39, 39, 42)'
            )
        ),
        yaxis=dict(
            ticksuffix=' ',  # gap between idiom labels and bars
            tickfont=dict(
                family=mono_font,
                color='rgb(39, 39, 42)'
            )
        ),
        font=dict(
            family=mono_font,
            color='rgb(39, 39, 42)'
        ),
    )
    # Hover shows "Name: 12.3%".
    fig.update_traces(hovertemplate='%{y}: %{x:.1f}%<extra></extra>')
    return fig


def process_text(text):
    """Lemmatize *text* and build both UI outputs.

    Parameters
    ----------
    text : str
        Romansh input text.

    Returns
    -------
    tuple
        (plotly Figure with per-idiom scores, pandas DataFrame with one
        row per token: lemmas, German translations, morphology).
    """
    doc = lemmatizer(text)
    token_analyses = _collect_token_analyses(doc)
    df_tokens = _tokens_dataframe(token_analyses)
    fig = _idiom_figure(doc.idiom_scores, doc.idiom.value)
    return fig, df_tokens
# NOTE(review): the ".svelte-drum8y" class below is generated per Gradio
# build — confirm it still matches the pinned Gradio version.
with gr.Blocks(
    title="Lemmatizer",
    css="""
    /* ===== Table Styling ===== */
    #full-width-table .wrap.svelte-drum8y,
    #full-width-table table {
        width: 100% !important;
        table-layout: auto !important;
    }
    #full-width-table td,
    #full-width-table th {
        white-space: nowrap !important;
    }
    /* === Specific column width adjustments === */
    #full-width-table table th:nth-child(1),
    #full-width-table table td:nth-child(1) {
        min-width: 200px !important; /* Word column */
    }
    #full-width-table table th:nth-child(2),
    #full-width-table table td:nth-child(2) {
        min-width: 200px !important; /* Lemma column */
    }
    #full-width-table table th:nth-child(3),
    #full-width-table table td:nth-child(3) {
        min-width: 200px !important; /* German translations column */
    }
    #full-width-table table th:nth-child(4),
    #full-width-table table td:nth-child(4) {
        min-width: 300px !important; /* Morphological Analysis column */
    }
    /* ===== Input box height control ===== */
    #input-box {
        display: flex !important;
        flex-direction: column !important;
        height: 360px !important; /* visually matches plot height ~400px */
        overflow: hidden !important;
    }
    #input-box textarea {
        flex-grow: 1 !important;
        height: 100% !important;
        max-height: 100% !important;
        overflow-y: auto !important;
        resize: none !important;
    }
    """
) as demo:
    # Page title with a small red BETA superscript.
    gr.Markdown(
        "# Romansh Lemmatizer"
        "<sup style='color:#FF5252;font-size:0.4em;vertical-align:super'>(BETA)</sup>"
    )
    with gr.Accordion("About", open=False):
        gr.Markdown(readme_content)

    # === Top Row: Input & Chart ===
    with gr.Row():
        with gr.Column(scale=1):
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="Enter Romansh text here...",
                value="La vulp era puspè ina giada fomentada.",
                lines=5,
                # FIX: the CSS above targets #input-box, but no component
                # carried that elem_id, so the height rules were inert.
                elem_id="input-box"
            )
            submit_btn = gr.Button("Analyze")
        with gr.Column(scale=2):
            idiom_chart = gr.Plot(label="Detected Idioms")

    # === Bottom Row: Full-width Table ===
    # datatype="markdown" lets the <b>/<br>/<span> markup produced by
    # process_text render inside the cells.
    token_table = gr.DataFrame(
        label="Analysis of Words",
        datatype="markdown",
        wrap=False,  # prevent Gradio from wrapping text
        elem_id="full-width-table"
    )

    # === Function Hook ===
    submit_btn.click(
        fn=process_text,
        inputs=[text_input],
        outputs=[idiom_chart, token_table]
    )

    # === Examples ===
    # example_sentences.tsv: one column per idiom, one sentence per cell.
    tsv_path = os.path.join(os.path.dirname(__file__), "example_sentences.tsv")
    df = pd.read_csv(tsv_path, sep='\t')

    # Pair every non-empty sentence with its idiom (the column header).
    examples_data = []
    for col in df.columns:
        for sentence in df[col].dropna():
            if sentence.strip():  # skip empty sentences
                examples_data.append((sentence, col))

    examples = [sentence for sentence, _ in examples_data]
    # Show "[idiom:] sentence" in the examples list while only the
    # sentence itself is fed into the textbox.
    example_labels = [f"[{idiom}:] {sentence}" for sentence, idiom in examples_data]
    gr.Examples(
        examples=examples,
        inputs=text_input,
        label="Example Sentences",
        example_labels=example_labels,
        examples_per_page=100,
        fn=process_text,
        outputs=[idiom_chart, token_table],
        run_on_click=True,
        # Pre-compute all example outputs at startup so clicks are instant.
        cache_examples=True,
        cache_mode='eager',
        preload=0,
    )

if __name__ == "__main__":
    demo.launch()