Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -19,6 +19,27 @@ token_classifier = pipeline(
|
|
| 19 |
|
| 20 |
tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Preprocess the 'word' column
|
| 23 |
def preprocess_text(text):
|
| 24 |
# Remove HTML tags
|
|
@@ -75,32 +96,24 @@ def split_text(text, max_tokens=500):
|
|
| 75 |
return chunks
|
| 76 |
|
| 77 |
def transform_chunks(marianne_segmentation):
|
| 78 |
-
|
| 79 |
-
print(marianne_segmentation)
|
| 80 |
-
|
| 81 |
marianne_segmentation = pd.DataFrame(marianne_segmentation)
|
| 82 |
-
|
| 83 |
-
print(marianne_segmentation)
|
| 84 |
-
|
| 85 |
-
# Filter out separators
|
| 86 |
marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
|
| 87 |
-
|
| 88 |
-
# Replace '¶' with '\n' and convert to string
|
| 89 |
marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
|
| 90 |
-
|
| 91 |
-
#A bit of lceaning.
|
| 92 |
marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
|
|
|
|
| 104 |
|
| 105 |
|
| 106 |
# Class to encapsulate the Falcon chatbot
|
|
@@ -109,22 +122,17 @@ class MistralChatBot:
|
|
| 109 |
self.system_prompt = system_prompt
|
| 110 |
|
| 111 |
def predict(self, user_message):
|
| 112 |
-
|
| 113 |
-
editorial_text = re.sub("\n", " ¶ ", user_message)
|
| 114 |
-
|
| 115 |
-
# Tokenize the prompt and check if it exceeds 500 tokens
|
| 116 |
num_tokens = len(tokenizer.tokenize(editorial_text))
|
| 117 |
-
|
| 118 |
if num_tokens > 500:
|
| 119 |
-
# Split the prompt into chunks
|
| 120 |
batch_prompts = split_text(editorial_text, max_tokens=500)
|
| 121 |
else:
|
| 122 |
batch_prompts = [editorial_text]
|
| 123 |
-
|
| 124 |
out = token_classifier(batch_prompts)
|
| 125 |
out = transform_chunks(out[0])
|
| 126 |
-
|
| 127 |
-
generated_text = '<h2 style="text-align:center">Réponse</h3>\n<div class="generation">' + out + "</div>"
|
| 128 |
return generated_text
|
| 129 |
|
| 130 |
# Create the Falcon chatbot instance
|
|
|
|
| 19 |
|
| 20 |
tokenizer = AutoTokenizer.from_pretrained(editorial_model, model_max_length=512)
|
| 21 |
|
| 22 |
+
css = """
|
| 23 |
+
<style>
|
| 24 |
+
.manuscript {
|
| 25 |
+
display: flex;
|
| 26 |
+
margin-bottom: 20px;
|
| 27 |
+
}
|
| 28 |
+
.annotation {
|
| 29 |
+
width: 30%;
|
| 30 |
+
padding-right: 20px;
|
| 31 |
+
color: grey;
|
| 32 |
+
font-style: italic;
|
| 33 |
+
}
|
| 34 |
+
.content {
|
| 35 |
+
width: 70%;
|
| 36 |
+
}
|
| 37 |
+
h3 {
|
| 38 |
+
margin-top: 0;
|
| 39 |
+
}
|
| 40 |
+
</style>
|
| 41 |
+
"""
|
| 42 |
+
|
| 43 |
# Preprocess the 'word' column
|
| 44 |
def preprocess_text(text):
|
| 45 |
# Remove HTML tags
|
|
|
|
| 96 |
return chunks
|
| 97 |
|
| 98 |
def transform_chunks(marianne_segmentation):
|
|
|
|
|
|
|
|
|
|
| 99 |
marianne_segmentation = pd.DataFrame(marianne_segmentation)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
marianne_segmentation = marianne_segmentation[marianne_segmentation['entity_group'] != 'separator']
|
|
|
|
|
|
|
| 101 |
marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).str.replace('¶', '\n', regex=False)
|
|
|
|
|
|
|
| 102 |
marianne_segmentation['word'] = marianne_segmentation['word'].astype(str).apply(preprocess_text)
|
| 103 |
+
marianne_segmentation = marianne_segmentation[marianne_segmentation['word'].notna() & (marianne_segmentation['word'] != '') & (marianne_segmentation['word'] != ' ')]
|
| 104 |
+
|
| 105 |
+
html_output = []
|
| 106 |
+
for _, row in marianne_segmentation.iterrows():
|
| 107 |
+
entity_group = row['entity_group']
|
| 108 |
+
word = row['word']
|
| 109 |
+
|
| 110 |
+
if entity_group == 'title':
|
| 111 |
+
html_output.append(f'<div class="manuscript"><div class="annotation">{entity_group}</div><div class="content"><h3>{word}</h3></div></div>')
|
| 112 |
+
else:
|
| 113 |
+
html_output.append(f'<div class="manuscript"><div class="annotation">{entity_group}</div><div class="content">{word}</div></div>')
|
| 114 |
|
| 115 |
+
final_html = '\n'.join(html_output)
|
| 116 |
+
return final_html
|
| 117 |
|
| 118 |
|
| 119 |
# Class to encapsulate the Falcon chatbot
|
|
|
|
| 122 |
self.system_prompt = system_prompt
|
| 123 |
|
| 124 |
def predict(self, user_message):
|
| 125 |
+
editorial_text = re.sub("\n", " ¶ ", user_message)
|
|
|
|
|
|
|
|
|
|
| 126 |
num_tokens = len(tokenizer.tokenize(editorial_text))
|
| 127 |
+
|
| 128 |
if num_tokens > 500:
|
|
|
|
| 129 |
batch_prompts = split_text(editorial_text, max_tokens=500)
|
| 130 |
else:
|
| 131 |
batch_prompts = [editorial_text]
|
| 132 |
+
|
| 133 |
out = token_classifier(batch_prompts)
|
| 134 |
out = transform_chunks(out[0])
|
| 135 |
+
generated_text = f'{css}<h2 style="text-align:center">Réponse</h2>\n<div class="generation">{out}</div>'
|
|
|
|
| 136 |
return generated_text
|
| 137 |
|
| 138 |
# Create the Falcon chatbot instance
|