import os

# Point the XDG cache directory at a writable location so libraries that
# cache downloads can write to it even when the home directory is read-only.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

# Instantiate the pipeline once at startup so every request reuses it.
pipeline = SolrNormalizationPipeline()
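
# A minimal sketch of calling the pipeline directly (hypothetical input;
# the result keys listed here are the ones normalize() consumes below):
#
#   result = pipeline("L'homme est mort.", lang="fr", diagnostics=True)
#   result["language"]            # language used for analysis
#   result["tokens"]              # tokens after the full analyzer chain
#   result["stopwords_detected"]  # stopwords removed by the stop filter
#   result["analyzer_pipeline"]   # ordered list of tokenizer/filter steps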

# Language codes offered in the UI ("general" is the non-language-specific option).
LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
DEFAULT_LANGUAGE = "en"


def normalize(text, lang_choice):
    """Run the pipeline on `text` and format the diagnostics for display."""
    try:
        # "Auto-detect" maps to lang=None, which lets the pipeline
        # identify the language itself.
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)

        # Render the analyzer chain (tokenizer + token filters) as a numbered list.
        analyzer_steps = []
        if result.get("analyzer_pipeline"):
            for i, step in enumerate(result["analyzer_pipeline"], 1):
                step_type = step.get("type", "unknown")
                step_name = step.get("name", "unnamed")
                analyzer_steps.append(f" {i}. {step_type}: {step_name}")

        analyzer_display = "\n".join(analyzer_steps) if analyzer_steps else " No analyzer steps found"

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
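
# Each analyzer step surfaced by normalize() is expected to be a small dict,
# e.g. {"type": "tokenizer", "name": "standard"} or
# {"type": "filter", "name": "lowercase"}; the .get() calls above are
# defensive because the exact diagnostics schema depends on the installed
# impresso_pipelines version.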


# Build the Gradio UI.
with gr.Blocks(title="Solr Normalization Demo") as demo:
    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)

    gr.Markdown(
        """
        # 🧹 Solr Normalization Pipeline Demo

        This demo showcases the **Solr Normalization Pipeline**, which replicates the text
        preprocessing steps Solr applies during indexing, so you can see how raw input is
        transformed before it becomes searchable.

        The pipeline applies:

        - **Tokenization** (splitting text into searchable units)
        - **Stopword removal** (filtering out common, uninformative words)
        - **Lowercasing and normalization**
        - **Language-specific filters** (e.g., stemming, elision)

        These steps are crucial for improving **search recall** and maintaining
        **linguistic consistency** across large, multilingual corpora.

        🧠 **Why is this useful?**

        - It explains why search results might not exactly match the words you entered.
        - It shows how different word forms are **collapsed** into searchable stems.
        - It helps interpret unexpected matches (or mismatches) when querying historical text collections.

        You can try the example below, or enter your own text to explore how it is
        normalized behind the scenes.
        """
    )
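
    # Illustrative example (assumed, not actual pipeline output): with
    # lang="en", "The running dogs jumped" would typically normalize to
    # tokens like ["run", "dog", "jump"] after stopword removal and stemming.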

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here...",
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value=DEFAULT_LANGUAGE,
                label="Language",
            )
            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")
            info_btn = gr.Button("Help", size="sm", scale=1)

        with gr.Column():
            with gr.Row():
                output = gr.Textbox(
                    label="Normalized Output",
                    lines=15,
                    placeholder="Results will appear here...",
                    scale=10,
                )

    # Help panel, hidden until the Help button reveals it.
    with gr.Accordion("📝 Help", open=False, visible=False) as info_accordion:
        gr.Markdown(
            """
            This pipeline mirrors the standard **Solr analyzer sequence** used in the Impresso
            project’s indexing infrastructure. It helps interpret how raw text is processed
            before being indexed.

            #### Key Components:

            - **Tokenization**: Splits input text into individual word units (tokens).
            - **Token Filters**: Applies a series of language-aware transformations, including:
                - `elision`: Removes leading apostrophes/articles (e.g., *l’homme* → *homme*).
                - `lowercase`: Converts tokens to lowercase.
                - `asciifolding`: Converts accented characters to basic ASCII (e.g., *é* → *e*).
                - `stop`: Removes common stopwords (e.g., *the*, *and*, *le*).
                - `stemmer`: Reduces words to their root form (e.g., *running* → *run*).
                - `normalization`: Applies custom language-specific rules.

            #### Use Cases:

            - Understand how language-specific rules impact search.
            - Evaluate the effect of stopwords, stemming, and normalization.
            - Debug or fine-tune analyzer behavior for multilingual corpora.
            """
        )
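
    # A sketch (assumed behavior, not verified against the actual Solr config)
    # of how those filters compose for a French token:
    #   "L'Été" --elision--> "Été" --lowercase--> "été" --asciifolding--> "ete"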

    # Run the pipeline on submit.
    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )

    # Reveal and expand the help accordion; returning an updated component
    # is how Gradio applies property changes to an existing component.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion,
    )

# Bind to all interfaces on port 7860, the conventional port for
# containerized Gradio deployments.
demo.launch(server_name="0.0.0.0", server_port=7860)