Spaces:
Running
Running
update langchain
Browse files
app.py
CHANGED
|
@@ -1,197 +1,258 @@
|
|
| 1 |
# app.py
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
import gradio as gr
|
| 4 |
-
|
| 5 |
import torch
|
| 6 |
import theme
|
| 7 |
theme = theme.Theme()
|
|
|
|
| 8 |
from huggingface_hub import from_pretrained_keras
|
| 9 |
from tensorflow.keras.applications import EfficientNetB0
|
| 10 |
-
|
| 11 |
import tensorflow as tf
|
| 12 |
from tensorflow import keras
|
|
|
|
| 13 |
from PIL import Image
|
| 14 |
-
from pydantic.v1 import BaseModel, Field
|
| 15 |
import shutil
|
| 16 |
-
import tenacity
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
from
|
|
|
|
|
|
|
|
|
|
| 20 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 21 |
from langchain.prompts import PromptTemplate
|
| 22 |
-
from langchain.chains import RetrievalQA
|
| 23 |
-
from langchain.prompts import ChatPromptTemplate
|
| 24 |
from langchain.schema import StrOutputParser
|
| 25 |
from langchain.schema.runnable import Runnable
|
| 26 |
from langchain.schema.runnable.config import RunnableConfig
|
| 27 |
-
from langchain.chains import
|
| 28 |
-
LLMChain, ConversationalRetrievalChain)
|
| 29 |
-
from langchain.vectorstores import Chroma
|
| 30 |
-
from langchain.memory import ConversationBufferMemory
|
| 31 |
-
from langchain.chains import LLMChain
|
| 32 |
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
|
| 33 |
-
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate,
|
| 34 |
from langchain.output_parsers import PydanticOutputParser
|
| 35 |
from langchain_community.llms import HuggingFaceHub
|
| 36 |
from langchain_community.document_loaders import WebBaseLoader
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
from
|
| 39 |
-
|
| 40 |
-
custom_title = "<span style='color: rgb(243, 239, 224);'>Green Greta</span>"
|
| 41 |
-
|
| 42 |
|
| 43 |
-
# Cell 1: Image Classification Model
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
model1 = from_pretrained_keras("rocioadlc/efficientnetB0_trash")
|
| 46 |
|
| 47 |
-
# Define class labels
|
| 48 |
class_labels = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
image_array = tf.keras.applications.efficientnet.preprocess_input(image_array)
|
| 56 |
-
# Expand the dimensions to create a batch
|
| 57 |
image_array = tf.expand_dims(image_array, 0)
|
| 58 |
-
#
|
| 59 |
predictions = model1.predict(image_array)
|
|
|
|
|
|
|
| 60 |
category_scores = {}
|
| 61 |
for i, class_label in enumerate(class_labels):
|
| 62 |
category_scores[class_label] = predictions[0][i].item()
|
| 63 |
|
| 64 |
return category_scores
|
| 65 |
|
| 66 |
-
|
| 67 |
image_gradio_app = gr.Interface(
|
| 68 |
fn=predict_image,
|
| 69 |
inputs=gr.Image(label="Image", sources=['upload', 'webcam'], type="pil"),
|
| 70 |
outputs=[gr.Label(label="Result")],
|
| 71 |
-
title=
|
| 72 |
theme=theme
|
| 73 |
)
|
| 74 |
|
| 75 |
-
# Cell 2: ChatBot Model
|
| 76 |
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
user_agent = UserAgent().random
|
| 79 |
header_template = {"User-Agent": user_agent}
|
| 80 |
|
| 81 |
-
#
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
)
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 168 |
chunk_size=1024,
|
| 169 |
chunk_overlap=150,
|
| 170 |
length_function=len
|
| 171 |
)
|
| 172 |
-
docs = text_splitter.split_documents(
|
| 173 |
-
|
|
|
|
| 174 |
embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
|
| 175 |
-
|
|
|
|
| 176 |
persist_directory = 'docs/chroma/'
|
|
|
|
| 177 |
|
| 178 |
-
#
|
| 179 |
-
shutil.rmtree(persist_directory, ignore_errors=True)
|
| 180 |
vectordb = Chroma.from_documents(
|
| 181 |
documents=docs,
|
| 182 |
embedding=embeddings,
|
| 183 |
persist_directory=persist_directory
|
| 184 |
)
|
| 185 |
-
# define retriever
|
| 186 |
-
retriever = vectordb.as_retriever(search_kwargs={"k": 2}, search_type="mmr")
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
# Schema of the structured reply requested from the LLM; it is handed to
# PydanticOutputParser, whose format instructions are injected into the prompt.
# NOTE(review): documented with `#` comments rather than a docstring on purpose:
# pydantic copies a model docstring into the schema "description", which would
# alter the text produced by parser.get_format_instructions(). Confirm before
# adding a docstring here.
class FinalAnswer(BaseModel):
|
| 189 |
# presumably an echo of the user's question (not read back by this file)
question: str = Field()
|
| 190 |
# reply text; the chat handler extracts the value that follows '"answer":'
answer: str = Field()
|
| 191 |
|
| 192 |
-
# Assuming you have a parser for the FinalAnswer class
|
| 193 |
parser = PydanticOutputParser(pydantic_object=FinalAnswer)
|
| 194 |
|
|
|
|
| 195 |
template = """
|
| 196 |
Your name is Greta and you are a recycling chatbot with the objective to anwer questions from user in English or Spanish /
|
| 197 |
Has sido diseñado y creado por el Grupo 1 del Máster en Data Science & Big Data de la promoción 2023/2024 de la Universidad Complutense de Madrid. Este grupo está fromado por Rocío, María Guillermo, Alejandra, Paloma y Álvaro /
|
|
@@ -205,14 +266,16 @@ User: {question}
|
|
| 205 |
{format_instructions}
|
| 206 |
"""
|
| 207 |
|
| 208 |
-
# Create the chat prompt templates
|
| 209 |
sys_prompt = SystemMessagePromptTemplate.from_template(template)
|
| 210 |
qa_prompt = ChatPromptTemplate(
|
| 211 |
messages=[
|
| 212 |
sys_prompt,
|
| 213 |
-
HumanMessagePromptTemplate.from_template("{question}")
|
|
|
|
| 214 |
partial_variables={"format_instructions": parser.get_format_instructions()}
|
| 215 |
)
|
|
|
|
|
|
|
| 216 |
llm = HuggingFaceHub(
|
| 217 |
repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 218 |
task="text-generation",
|
|
@@ -224,42 +287,55 @@ llm = HuggingFaceHub(
|
|
| 224 |
},
|
| 225 |
)
|
| 226 |
|
|
|
|
| 227 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
| 228 |
-
llm
|
| 229 |
-
memory
|
| 230 |
-
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
combine_docs_chain_kwargs={'prompt': qa_prompt},
|
| 233 |
-
get_chat_history
|
| 234 |
-
rephrase_question
|
| 235 |
-
output_key
|
| 236 |
)
|
| 237 |
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
result = qa_chain.invoke({'question': question})
|
| 240 |
output_string = result['output']
|
| 241 |
|
| 242 |
-
# Find the index of the last occurrence of "answer": in the string
|
| 243 |
answer_index = output_string.rfind('"answer":')
|
| 244 |
-
|
| 245 |
-
# Extract the substring starting from the "answer": index
|
| 246 |
answer_part = output_string[answer_index + len('"answer":'):].strip()
|
| 247 |
|
| 248 |
# Find the next occurrence of a double quote to get the start of the answer value
|
| 249 |
quote_index = answer_part.find('"')
|
| 250 |
-
|
| 251 |
-
# Extract the answer value between double quotes
|
| 252 |
answer_value = answer_part[quote_index + 1:answer_part.find('"', quote_index + 1)]
|
| 253 |
|
| 254 |
return answer_value
|
| 255 |
|
| 256 |
|
|
|
|
| 257 |
chatbot_gradio_app = gr.ChatInterface(
|
| 258 |
fn=chat_interface,
|
| 259 |
-
title=
|
| 260 |
)
|
| 261 |
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
banner_tab_content = """
|
| 264 |
<div style="background-color: #d3e3c3; text-align: center; padding: 20px; display: flex; flex-direction: column; align-items: center;">
|
| 265 |
<img src="https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/resolve/main/front_4.jpg" alt="Banner Image" style="width: 50%; max-width: 500px; margin: 0 auto;">
|
|
@@ -283,12 +359,18 @@ banner_tab_content = """
|
|
| 283 |
"""
|
| 284 |
banner_tab = gr.Markdown(banner_tab_content)
|
| 285 |
|
| 286 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
app = gr.TabbedInterface(
|
| 288 |
[banner_tab, image_gradio_app, chatbot_gradio_app],
|
| 289 |
tab_names=["Welcome to Green Greta", "Green Greta Image Classification", "Green Greta Chat"],
|
| 290 |
theme=theme
|
| 291 |
)
|
| 292 |
|
|
|
|
| 293 |
app.queue()
|
| 294 |
-
app.launch()
|
|
|
|
| 1 |
# app.py
|
| 2 |
|
| 3 |
+
"""
|
| 4 |
+
=========================================================
|
| 5 |
+
1) IMPORTS & DEPENDENCIES
|
| 6 |
+
=========================================================
|
| 7 |
+
"""
|
| 8 |
import gradio as gr
|
|
|
|
| 9 |
import torch
|
| 10 |
import theme
|
| 11 |
theme = theme.Theme()
|
| 12 |
+
|
| 13 |
from huggingface_hub import from_pretrained_keras
|
| 14 |
from tensorflow.keras.applications import EfficientNetB0
|
|
|
|
| 15 |
import tensorflow as tf
|
| 16 |
from tensorflow import keras
|
| 17 |
+
|
| 18 |
from PIL import Image
|
|
|
|
| 19 |
import shutil
|
|
|
|
| 20 |
|
| 21 |
+
import tenacity # for retrying failed requests
|
| 22 |
+
from fake_useragent import UserAgent
|
| 23 |
+
|
| 24 |
+
# LangChain
|
| 25 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 26 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 27 |
from langchain.prompts import PromptTemplate
|
|
|
|
|
|
|
| 28 |
from langchain.schema import StrOutputParser
|
| 29 |
from langchain.schema.runnable import Runnable
|
| 30 |
from langchain.schema.runnable.config import RunnableConfig
|
| 31 |
+
from langchain.chains import RetrievalQA, ConversationalRetrievalChain, LLMChain
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
from langchain.prompts.chat import ChatPromptTemplate, SystemMessagePromptTemplate
|
| 33 |
+
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate, MessagesPlaceholder
|
| 34 |
from langchain.output_parsers import PydanticOutputParser
|
| 35 |
from langchain_community.llms import HuggingFaceHub
|
| 36 |
from langchain_community.document_loaders import WebBaseLoader
|
| 37 |
+
from langchain.vectorstores import Chroma
|
| 38 |
+
from langchain.memory import ConversationBufferMemory
|
| 39 |
|
| 40 |
+
from pydantic.v1 import BaseModel, Field
|
|
|
|
|
|
|
|
|
|
| 41 |
|
|
|
|
| 42 |
|
| 43 |
+
"""
|
| 44 |
+
=========================================================
|
| 45 |
+
2) IMAGE CLASSIFICATION MODEL SETUP
|
| 46 |
+
=========================================================
|
| 47 |
+
"""
|
| 48 |
+
# Load a Keras model from HuggingFace Hub
|
| 49 |
model1 = from_pretrained_keras("rocioadlc/efficientnetB0_trash")
|
| 50 |
|
| 51 |
+
# Define class labels for the trash classification
|
| 52 |
class_labels = ['cardboard', 'glass', 'metal', 'paper', 'plastic', 'trash']
|
| 53 |
|
| 54 |
+
def predict_image(input_image):
    """
    Score an image against every trash category.

    The image is resized, converted to an array, passed through the
    EfficientNet preprocessing function, wrapped in a batch of one and fed
    to ``model1``.

    Args:
        input_image: image object exposing ``resize`` (the Gradio input is
            configured with ``type="pil"``).

    Returns:
        dict mapping each entry of ``class_labels`` to the score found at
        the same index of the first prediction row.
    """
    # NOTE(review): the target size is (244, 224). PIL's resize takes
    # (width, height) and EfficientNetB0's stock input is 224x224, so 244 may
    # be a typo for 224. Left unchanged; confirm against the input shape the
    # model was trained with.
    resized = input_image.resize((244, 224))
    pixels = tf.keras.preprocessing.image.img_to_array(resized)

    # EfficientNet-specific input preprocessing, then add the batch axis.
    pixels = tf.keras.applications.efficientnet.preprocess_input(pixels)
    batch = tf.expand_dims(pixels, 0)

    predictions = model1.predict(batch)

    # One score per label, taken positionally from the single batch row.
    return {
        label: predictions[0][idx].item()
        for idx, label in enumerate(class_labels)
    }
|
| 77 |
|
| 78 |
+
# Gradio interface for image classification
|
| 79 |
image_gradio_app = gr.Interface(
|
| 80 |
fn=predict_image,
|
| 81 |
inputs=gr.Image(label="Image", sources=['upload', 'webcam'], type="pil"),
|
| 82 |
outputs=[gr.Label(label="Result")],
|
| 83 |
+
title="<span style='color: rgb(243, 239, 224);'>Green Greta</span>",
|
| 84 |
theme=theme
|
| 85 |
)
|
| 86 |
|
|
|
|
| 87 |
|
| 88 |
+
"""
|
| 89 |
+
=========================================================
|
| 90 |
+
3) CHATBOT MODEL SETUP
|
| 91 |
+
=========================================================
|
| 92 |
+
"""
|
| 93 |
+
# 3.1) Define user agent to avoid blocking, etc.
|
| 94 |
user_agent = UserAgent().random
|
| 95 |
header_template = {"User-Agent": user_agent}
|
| 96 |
|
| 97 |
+
# 3.2) List of URLs to load for retrieval
|
| 98 |
+
# Pages scraped to build the retrieval corpus. Each URL is listed exactly once:
# a repeated entry would be downloaded, split and embedded twice, filling the
# vector store with duplicate chunks.
URLS = [
    "https://www.epa.gov/recycle/frequent-questions-recycling",
    "https://www.whitehorsedc.gov.uk/vale-of-white-horse-district-council/recycling-rubbish-and-waste/lets-get-real-about-recycling/",
    "https://www.teimas.com/blog/13-preguntas-y-respuestas-sobre-la-ley-de-residuos-07-2022",
    "https://www.molok.com/es/blog/gestion-de-residuos-solidos-urbanos-rsu-10-dudas-comunes",
    "https://espanol.epa.gov/espanol/el-reciclaje#valelapena",
    "https://espanol.epa.gov/espanol/preguntas-frecuentes-sobre-reciclado-de-plastico-y-elaboracion-de-abono-vegetal",
    "https://espanol.epa.gov/espanol/consejo-del-dia-como-reciclo-mis",
    "https://espanol.epa.gov/espanol/recursos-para-reciclar-dispositivos-electronicos",
    "https://www.epa.gov/recycle/electronics-donation-and-recycling",
    "https://reducereutilizarecicla.org/que-es-el-reciclaje/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-amarillo/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-azul/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-verde/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-marron-organico/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/contenedor-gris-restos/",
    "https://reducereutilizarecicla.org/contenedores-de-reciclaje/punto-limpio/",
    "https://reducereutilizarecicla.org/donde-tirar-auriculares/",
    "https://reducereutilizarecicla.org/donde-tirar-sartenes/",
    "https://reducereutilizarecicla.org/donde-tirar-aceite-usado/",
    "https://reducereutilizarecicla.org/como-se-reciclan-los-envases-tipo-brik/",
    "https://reducereutilizarecicla.org/los-envases-del-verano/",
    "https://reducereutilizarecicla.org/donde-tirar-radiografias/",
    "https://reducereutilizarecicla.org/envases-ecologicos/",
    "https://reducereutilizarecicla.org/donde-tirar-los-restos-de-pintura/",
    "https://reducereutilizarecicla.org/valorizacion-de-residuos/",
    "https://reducereutilizarecicla.org/como-reciclar-pilas/",
    "https://reducereutilizarecicla.org/como-reciclar-capsulas-de-cafe/",
    "https://reducereutilizarecicla.org/reciclando-cd/",
    "https://reducereutilizarecicla.org/donde-tirar-neumaticos/",
    "https://reducereutilizarecicla.org/como-reciclar-una-canasta-de-mimbre/",
    "https://reducereutilizarecicla.org/como-funciona-el-contenedor-amarillo/",
    "https://reducereutilizarecicla.org/donde-se-tiran-los-vapers/",
    "https://reducereutilizarecicla.org/cuanto-tarda-una-bolsa-biodegradable-en-degradarse/",
    "https://reducereutilizarecicla.org/donde-se-reciclan-los-juguetes/",
    "https://reducereutilizarecicla.org/objetos-que-se-pueden-reutilizar/",
    "https://reducereutilizarecicla.org/la-parafina-se-puede-reutilizar/",
    "https://reducereutilizarecicla.org/planta-de-reciclaje-de-papel/",
    "https://reducereutilizarecicla.org/como-saber-si-un-envase-es-reciclable/",
    "https://reducereutilizarecicla.org/reutilizar-vasos-de-vela/",
    "https://reducereutilizarecicla.org/bolsas-frio-calor/",
    "https://reducereutilizarecicla.org/reciclar-y-reutilizar-materiales-de-construccion/",
    "https://reducereutilizarecicla.org/que-es-exactamente-el-pet/",
    "https://reducereutilizarecicla.org/tipos-de-reciclaje/",
    "https://reducereutilizarecicla.org/que-hacer-con-palets-reciclados/",
    "https://reducereutilizarecicla.org/vertederos-controlados/",
    "https://reducereutilizarecicla.org/donde-tirar-escombros/",
    "https://reducereutilizarecicla.org/como-reciclar-los-residuos-de-ps-poliestireno/",
    "https://reducereutilizarecicla.org/tirar-la-basura-sin-bolsas/",
    "https://reducereutilizarecicla.org/tirar-el-palo-de-la-fregona/",
    "https://reducereutilizarecicla.org/la-mejor-manera-de-reciclar-una-pala-de-padel/",
    "https://reducereutilizarecicla.org/sabes-donde-tirar-las-llantas-viejas-de-un-coche/",
    "https://reducereutilizarecicla.org/sabes-donde-tirar-el-arbol-de-navidad/",
    "https://reducereutilizarecicla.org/clavos-tornillos-herramientas-donde-tirar-hierro/",
    "https://reducereutilizarecicla.org/donde-tirar-un-secador-de-pelo-contenedor-o-punto-limpio/",
    "https://reducereutilizarecicla.org/donde-tirar-electrodomesticos/",
    "https://reducereutilizarecicla.org/donde-puedo-tirar-ramas-de-arboles/",
    "https://reducereutilizarecicla.org/donde-se-tira-el-muerdago-quemado/",
    "https://reducereutilizarecicla.org/sandalias-caucho-reciclado-neumaticos/",
    "https://reducereutilizarecicla.org/ideas-para-reciclar-aspas-de-ventilador-de-techo/",
    "https://reducereutilizarecicla.org/reciclar-sacos-dormir/",
    "https://reducereutilizarecicla.org/reciclar-sillas-playa/",
    "https://reducereutilizarecicla.org/donde-tirar-antipolillas/",
    "https://reducereutilizarecicla.org/que-hacer-con-los-juguetes-viejos/",
    "https://reducereutilizarecicla.org/como-utilizar-las-mascarillas-y-el-gel-hidroalcoholico-en-la-playa/",
    "https://reducereutilizarecicla.org/ideas-para-reciclar-un-ventilador-de-pie/",
    "https://reducereutilizarecicla.org/donde-tirar-gasoil/",
    "https://reducereutilizarecicla.org/donde-puedo-tirar-basura-electronica/",
    "https://reducereutilizarecicla.org/donde-tirar-agujas/",
    "https://reducereutilizarecicla.org/donde-tirar-residuos-peligrosos/",
    "https://reducereutilizarecicla.org/donde-tirar-los-cables/",
    "https://reducereutilizarecicla.org/donde-tirar-bicicletas/",
    "https://reducereutilizarecicla.org/donde-tirar-maletas/",
    "https://reducereutilizarecicla.org/como-reciclar-una-pantalla/",
    "https://reducereutilizarecicla.org/donde-tirar-ropa-usada/",
]
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
@tenacity.retry(
    stop=tenacity.stop_after_attempt(3),  # give up after the third attempt
    wait=tenacity.wait_fixed(3),          # pause 3 seconds between attempts
    reraise=True,
)
def load_url(url):
    """
    Fetch a single web page as LangChain documents.

    A WebBaseLoader is built for just this URL, using the module-level
    ``header_template`` as request headers, and its ``load()`` result is
    returned. The tenacity decorator repeats the call on failure; with
    ``reraise=True`` the last exception propagates to the caller once the
    attempts are exhausted.
    """
    return WebBaseLoader(
        web_paths=[url],
        header_template=header_template,
    ).load()
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def safe_load_all_urls(urls):
    """
    Load documents from every distinct URL in ``urls``, skipping failures.

    Each URL is loaded through ``load_url``. Repeated URLs are loaded only
    once (first occurrence wins, original order kept) so the same page is
    not indexed several times.

    Args:
        urls: iterable of URL strings.

    Returns:
        A flat list with the documents of every URL that loaded successfully.
        A URL whose load raises is reported on stdout and skipped.
    """
    all_docs = []
    # dict.fromkeys drops duplicates while preserving insertion order.
    for link in dict.fromkeys(urls):
        try:
            all_docs.extend(load_url(link))
        except Exception as e:
            # Deliberate best-effort boundary: one unreachable page must not
            # abort building the whole corpus.
            print(f"Skipping URL due to error: {link}\nError: {e}\n")
    return all_docs
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# 3.3) Actually load the data from all URLs
|
| 212 |
+
all_loaded_docs = safe_load_all_urls(URLS)
|
| 213 |
+
|
| 214 |
+
# 3.4) Split the documents into manageable chunks
|
| 215 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 216 |
chunk_size=1024,
|
| 217 |
chunk_overlap=150,
|
| 218 |
length_function=len
|
| 219 |
)
|
| 220 |
+
docs = text_splitter.split_documents(all_loaded_docs)
|
| 221 |
+
|
| 222 |
+
# 3.5) Create embeddings
|
| 223 |
embeddings = HuggingFaceEmbeddings(model_name='thenlper/gte-small')
|
| 224 |
+
|
| 225 |
+
# 3.6) Create a persistent directory to store vector DB
|
| 226 |
persist_directory = 'docs/chroma/'
|
| 227 |
+
shutil.rmtree(persist_directory, ignore_errors=True) # remove old DB files
|
| 228 |
|
| 229 |
+
# 3.7) Build Chroma vector store
|
|
|
|
| 230 |
vectordb = Chroma.from_documents(
|
| 231 |
documents=docs,
|
| 232 |
embedding=embeddings,
|
| 233 |
persist_directory=persist_directory
|
| 234 |
)
|
|
|
|
|
|
|
| 235 |
|
| 236 |
+
# 3.8) Create a retriever
|
| 237 |
+
retriever = vectordb.as_retriever(
|
| 238 |
+
search_kwargs={"k": 2},
|
| 239 |
+
search_type="mmr"
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
"""
|
| 244 |
+
=========================================================
|
| 245 |
+
4) PROMPT & CHAIN SETUP
|
| 246 |
+
=========================================================
|
| 247 |
+
"""
|
| 248 |
+
# 4.1) Define the schema for final chatbot answers
|
| 249 |
# Schema of the structured reply requested from the LLM; it is handed to
# PydanticOutputParser, whose format instructions are injected into the prompt.
# NOTE(review): documented with `#` comments rather than a docstring on purpose:
# pydantic copies a model docstring into the schema "description", which would
# alter the text produced by parser.get_format_instructions(). Confirm before
# adding a docstring here.
class FinalAnswer(BaseModel):
|
| 250 |
# presumably an echo of the user's question (not read back by this file)
question: str = Field()
|
| 251 |
# reply text; chat_interface extracts the value that follows '"answer":'
answer: str = Field()
|
| 252 |
|
|
|
|
| 253 |
parser = PydanticOutputParser(pydantic_object=FinalAnswer)
|
| 254 |
|
| 255 |
+
# 4.2) Prompt template: system instructions
|
| 256 |
template = """
|
| 257 |
Your name is Greta and you are a recycling chatbot with the objective to anwer questions from user in English or Spanish /
|
| 258 |
Has sido diseñado y creado por el Grupo 1 del Máster en Data Science & Big Data de la promoción 2023/2024 de la Universidad Complutense de Madrid. Este grupo está fromado por Rocío, María Guillermo, Alejandra, Paloma y Álvaro /
|
|
|
|
| 266 |
{format_instructions}
|
| 267 |
"""
|
| 268 |
|
|
|
|
| 269 |
sys_prompt = SystemMessagePromptTemplate.from_template(template)
|
| 270 |
qa_prompt = ChatPromptTemplate(
|
| 271 |
messages=[
|
| 272 |
sys_prompt,
|
| 273 |
+
HumanMessagePromptTemplate.from_template("{question}")
|
| 274 |
+
],
|
| 275 |
partial_variables={"format_instructions": parser.get_format_instructions()}
|
| 276 |
)
|
| 277 |
+
|
| 278 |
+
# 4.3) Define the LLM from HuggingFace
|
| 279 |
llm = HuggingFaceHub(
|
| 280 |
repo_id="mistralai/Mixtral-8x7B-Instruct-v0.1",
|
| 281 |
task="text-generation",
|
|
|
|
| 287 |
},
|
| 288 |
)
|
| 289 |
|
| 290 |
+
# 4.4) Create a ConversationalRetrievalChain that uses the above LLM
|
| 291 |
qa_chain = ConversationalRetrievalChain.from_llm(
|
| 292 |
+
llm=llm,
|
| 293 |
+
memory=ConversationBufferMemory(
|
| 294 |
+
llm=llm,
|
| 295 |
+
memory_key="chat_history",
|
| 296 |
+
input_key='question',
|
| 297 |
+
output_key='output'
|
| 298 |
+
),
|
| 299 |
+
retriever=retriever,
|
| 300 |
+
verbose=True,
|
| 301 |
combine_docs_chain_kwargs={'prompt': qa_prompt},
|
| 302 |
+
get_chat_history=lambda h : h, # pass memory directly
|
| 303 |
+
rephrase_question=False,
|
| 304 |
+
output_key='output'
|
| 305 |
)
|
| 306 |
|
| 307 |
+
|
| 308 |
+
def _extract_answer(output_string):
    """
    Pull the value of the last ``"answer":`` field out of raw LLM output.

    The value is decoded as a JSON string literal, so escaped quotes and
    escape sequences inside the answer are handled. If the field is missing,
    the stripped raw text is returned; if the literal is malformed (for
    example, never closed), the text after the opening quote is returned.
    """
    import json  # local import: keeps the block self-contained

    marker = '"answer":'
    marker_pos = output_string.rfind(marker)
    if marker_pos == -1:
        # No structured answer at all: show the model's text as-is.
        return output_string.strip()

    tail = output_string[marker_pos + len(marker):].strip()
    open_quote = tail.find('"')
    if open_quote == -1:
        # Value is not a quoted string (e.g. null or bare text).
        return tail

    try:
        # strict=False tolerates literal newlines inside the string.
        value, _ = json.JSONDecoder(strict=False).raw_decode(tail, open_quote)
        return value
    except ValueError:
        # Malformed literal: fall back to a plain scan for the closing quote.
        close_quote = tail.find('"', open_quote + 1)
        if close_quote == -1:
            return tail[open_quote + 1:]
        return tail[open_quote + 1:close_quote]


def chat_interface(question, history):
    """
    Answer a chat message for the Gradio ChatInterface.

    The question is run through ``qa_chain`` and the answer text is extracted
    from the chain's ``'output'`` entry.

    Args:
        question: the user's message.
        history: chat history supplied by Gradio; unused here because the
            chain is configured with its own memory object.

    Returns:
        The answer string to display in the chat.
    """
    result = qa_chain.invoke({'question': question})
    return _extract_answer(result['output'])
|
| 325 |
|
| 326 |
|
| 327 |
+
# Gradio chat interface for the chatbot
|
| 328 |
chatbot_gradio_app = gr.ChatInterface(
|
| 329 |
fn=chat_interface,
|
| 330 |
+
title="<span style='color: rgb(243, 239, 224);'>Green Greta</span>"
|
| 331 |
)
|
| 332 |
|
| 333 |
+
|
| 334 |
+
"""
|
| 335 |
+
=========================================================
|
| 336 |
+
5) BANNER / WELCOME TAB
|
| 337 |
+
=========================================================
|
| 338 |
+
"""
|
| 339 |
banner_tab_content = """
|
| 340 |
<div style="background-color: #d3e3c3; text-align: center; padding: 20px; display: flex; flex-direction: column; align-items: center;">
|
| 341 |
<img src="https://huggingface.co/spaces/ALVHB95/TFM_DataScience_APP/resolve/main/front_4.jpg" alt="Banner Image" style="width: 50%; max-width: 500px; margin: 0 auto;">
|
|
|
|
| 359 |
"""
|
| 360 |
banner_tab = gr.Markdown(banner_tab_content)
|
| 361 |
|
| 362 |
+
|
| 363 |
+
"""
|
| 364 |
+
=========================================================
|
| 365 |
+
6) GRADIO FINAL APP: TABS
|
| 366 |
+
=========================================================
|
| 367 |
+
"""
|
| 368 |
app = gr.TabbedInterface(
|
| 369 |
[banner_tab, image_gradio_app, chatbot_gradio_app],
|
| 370 |
tab_names=["Welcome to Green Greta", "Green Greta Image Classification", "Green Greta Chat"],
|
| 371 |
theme=theme
|
| 372 |
)
|
| 373 |
|
| 374 |
+
# Enable queue() for concurrency and launch the Gradio app
|
| 375 |
app.queue()
|
| 376 |
+
app.launch()
|