import os

# Point the XDG cache directory at a writable location so libraries that
# cache downloads can write to it even when the home directory is read-only.
os.environ["XDG_CACHE_HOME"] = "/tmp/.cache"

import gradio as gr
from impresso_pipelines.solrnormalization import SolrNormalizationPipeline

# Instantiate the pipeline once at startup so every request reuses it.
pipeline = SolrNormalizationPipeline()
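
# A minimal sketch of calling the pipeline directly (hypothetical input;
# the result keys listed here are the ones normalize() consumes below):
#
#   result = pipeline("L'homme est mort.", lang="fr", diagnostics=True)
#   result["language"]            # language used for analysis
#   result["tokens"]              # tokens after the full analyzer chain
#   result["stopwords_detected"]  # stopwords removed by the stop filter
#   result["analyzer_pipeline"]   # ordered list of tokenizer/filter steps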

# Language codes offered in the UI ("general" is the non-language-specific option).
LANGUAGES = ["de", "fr", "es", "it", "pt", "nl", "en", "general"]

EXAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. This is a sample text for demonstration purposes."
DEFAULT_LANGUAGE = "en"


def normalize(text, lang_choice):
    """Run the pipeline on `text` and format the diagnostics for display."""
    try:
        # "Auto-detect" maps to lang=None, which lets the pipeline
        # identify the language itself.
        lang = None if lang_choice == "Auto-detect" else lang_choice
        result = pipeline(text, lang=lang, diagnostics=True)

        # Render the analyzer chain (tokenizer + token filters) as a numbered list.
        analyzer_steps = []
        if result.get("analyzer_pipeline"):
            for i, step in enumerate(result["analyzer_pipeline"], 1):
                step_type = step.get("type", "unknown")
                step_name = step.get("name", "unnamed")
                analyzer_steps.append(f" {i}. {step_type}: {step_name}")

        analyzer_display = "\n".join(analyzer_steps) if analyzer_steps else " No analyzer steps found"

        return (
            f"🌍 Language: {result['language']}\n\n"
            f"🔤 Tokens:\n{result['tokens']}\n\n"
            f"🚫 Detected stopwords:\n{result['stopwords_detected']}\n\n"
            f"⚙️ Analyzer pipeline:\n{analyzer_display}"
        )
    except Exception as e:
        print("❌ Pipeline error:", e)
        return f"Error: {e}"
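
# Each analyzer step surfaced by normalize() is expected to be a small dict,
# e.g. {"type": "tokenizer", "name": "standard"} or
# {"type": "filter", "name": "lowercase"}; the .get() calls above are
# defensive because the exact diagnostics schema depends on the installed
# impresso_pipelines version.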


# Build the Gradio UI.
with gr.Blocks(title="Solr Normalization Demo") as demo:
    gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)

    gr.Markdown(
        """
        # 🧹 Solr Normalization Pipeline Demo

        This demo showcases the **Solr Normalization Pipeline**, which replicates the text
        preprocessing steps Solr applies during indexing, so you can see how raw input is
        transformed before it becomes searchable.

        The pipeline applies:

        - **Tokenization** (splitting text into searchable units)
        - **Stopword removal** (filtering out common, uninformative words)
        - **Lowercasing and normalization**
        - **Language-specific filters** (e.g., stemming, elision)

        These steps are crucial for improving **search recall** and maintaining
        **linguistic consistency** across large, multilingual corpora.

        🧠 **Why is this useful?**

        - It explains why search results might not exactly match the words you entered.
        - It shows how different word forms are **collapsed** into searchable stems.
        - It helps interpret unexpected matches (or mismatches) when querying historical text collections.

        You can try the example below, or enter your own text to explore how it is
        normalized behind the scenes.
        """
    )
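
    # Illustrative example (assumed, not actual pipeline output): with
    # lang="en", "The running dogs jumped" would typically normalize to
    # tokens like ["run", "dog", "jump"] after stopword removal and stemming.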

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Enter Text",
                value=EXAMPLE_TEXT,
                lines=3,
                placeholder="Enter your text here...",
            )
            lang_dropdown = gr.Dropdown(
                choices=["Auto-detect"] + LANGUAGES,
                value=DEFAULT_LANGUAGE,
                label="Language",
            )
            submit_btn = gr.Button("🚀 Normalize Text", variant="primary")
            info_btn = gr.Button("Help", size="sm", scale=1)

        with gr.Column():
            with gr.Row():
                output = gr.Textbox(
                    label="Normalized Output",
                    lines=15,
                    placeholder="Results will appear here...",
                    scale=10,
                )

    # Help panel, hidden until the Help button reveals it.
    with gr.Accordion("📝 Help", open=False, visible=False) as info_accordion:
        gr.Markdown(
            """
            This pipeline mirrors the standard **Solr analyzer sequence** used in the Impresso
            project’s indexing infrastructure. It helps interpret how raw text is processed
            before being indexed.

            #### Key Components:

            - **Tokenization**: Splits input text into individual word units (tokens).
            - **Token Filters**: Applies a series of language-aware transformations, including:
                - `elision`: Removes leading apostrophes/articles (e.g., *l’homme* → *homme*).
                - `lowercase`: Converts tokens to lowercase.
                - `asciifolding`: Converts accented characters to basic ASCII (e.g., *é* → *e*).
                - `stop`: Removes common stopwords (e.g., *the*, *and*, *le*).
                - `stemmer`: Reduces words to their root form (e.g., *running* → *run*).
                - `normalization`: Applies custom language-specific rules.

            #### Use Cases:

            - Understand how language-specific rules impact search.
            - Evaluate the effect of stopwords, stemming, and normalization.
            - Debug or fine-tune analyzer behavior for multilingual corpora.
            """
        )
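
    # A sketch (assumed behavior, not verified against the actual Solr config)
    # of how those filters compose for a French token:
    #   "L'Été" --elision--> "Été" --lowercase--> "été" --asciifolding--> "ete"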

    # Run the pipeline on submit.
    submit_btn.click(
        fn=normalize,
        inputs=[text_input, lang_dropdown],
        outputs=output,
    )

    # Reveal and expand the help accordion; returning an updated component
    # is how Gradio applies property changes to an existing component.
    info_btn.click(
        fn=lambda: gr.Accordion(visible=True, open=True),
        outputs=info_accordion,
    )

# Bind to all interfaces on port 7860, the conventional port for
# containerized Gradio deployments.
demo.launch(server_name="0.0.0.0", server_port=7860)