simon-clmtd committed on
Commit
e054c82
·
verified ·
1 Parent(s): affc40d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -21
app.py CHANGED
@@ -40,16 +40,28 @@ with gr.Blocks(title="Solr Normalization Demo") as demo:
40
  gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
41
 
42
  gr.Markdown(
43
- """
44
- # 🧹 Solr Normalization Pipeline Demo
45
-
46
- **Solr normalization** is meant to demonstrate how text is normalized in the **Impresso** project.
47
- This pipeline replicates Solr's text processing functionality, showing how text goes through various
48
- analyzers including tokenization, stopword removal, and language-specific transformations.
49
-
50
- Try the example below or enter your own text to see how it gets processed!
51
- """
52
- )
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  with gr.Row():
55
  with gr.Column():
@@ -78,17 +90,24 @@ with gr.Blocks(title="Solr Normalization Demo") as demo:
78
 
79
  # Info modal/accordion for pipeline details
80
  with gr.Accordion("📝 About the Pipeline", open=False, visible=False) as info_accordion:
81
- gr.Markdown(
82
- """
83
- - **Tokenization**: Splits text into individual tokens
84
- - **Tokenfilter**: Applies various transformations like:
85
- - elision: removes leading apostrophes and articles in languages like French and Italian
86
- - lowercase: converts to lowercase
87
- - asciifolding: converts accented characters to ASCII
88
- - stop: removes common stopwords
89
- - stemmer: reduces words to a common base or stem, improving recall in search
90
- - normalization: applies language-specific normalization
91
- """
 
 
 
 
 
 
 
92
  )
93
 
94
  submit_btn.click(
 
40
  gr.Image("logo.jpeg", label=None, show_label=False, container=False, height=100)
41
 
42
  gr.Markdown(
43
+ """
44
+ # 🧹 Solr Normalization Pipeline Demo
45
+
46
+ This demo showcases the **Solr Normalization Pipeline**, which replicates the text preprocessing steps applied by Solr during indexing to help you understand how raw input is transformed before becoming searchable.
47
+
48
+ The pipeline applies:
49
+ - **Tokenization** (splitting text into searchable units)
50
+ - **Stopword removal** (filtering out common, uninformative words)
51
+ - **Lowercasing and normalization**
52
+ - **Language-specific filters** (e.g., stemming, elision)
53
+
54
+ These steps are crucial for improving **search recall** and maintaining **linguistic consistency** across large, multilingual corpora.
55
+
56
+ 🧠 **Why is this useful?**
57
+
58
+ - It explains why search results might not exactly match the words you entered.
59
+ - It shows how different word forms are **collapsed** into searchable stems.
60
+ - It helps interpret unexpected matches (or mismatches) when querying historical text collections.
61
+
62
+ You can try the example below, or enter your own text to explore how it is normalized behind the scenes.
63
+ """
64
+ )
65
 
66
  with gr.Row():
67
  with gr.Column():
 
90
 
91
  # Info modal/accordion for pipeline details
92
  with gr.Accordion("📝 About the Pipeline", open=False, visible=False) as info_accordion:
93
+ gr.Markdown("""
94
+ This pipeline mirrors the standard **Solr analyzer sequence** used in the Impresso project’s indexing infrastructure. It helps interpret how raw text is processed before being indexed.
95
+
96
+ #### Key Components:
97
+ - **Tokenization**: Splits input text into individual word units (tokens).
98
+ - **Token Filters**: Applies a series of language-aware transformations, including:
99
+ - `elision`: Removes leading apostrophes/articles (e.g., *l’homme* → *homme*).
100
+ - `lowercase`: Converts tokens to lowercase.
101
+ - `asciifolding`: Converts accented characters to basic ASCII (e.g., *é* → *e*).
102
+ - `stop`: Removes common stopwords (e.g., *the*, *and*, *le*).
103
+ - `stemmer`: Reduces words to their root form (e.g., *running* → *run*).
104
+ - `normalization`: Applies custom language-specific rules.
105
+
106
+ #### Use Cases:
107
+ - Understand how language-specific rules impact search.
108
+ - Evaluate the effect of stopwords, stemming, and normalization.
109
+ - Debug or fine-tune analyzer behavior for multilingual corpora.
110
+ """
111
  )
112
 
113
  submit_btn.click(