Spaces:

Ekimetrics
/

climate-question-answering

Running

App Files Files Community

TheoLvs committed on Nov 27, 2023

Commit

18f4541

1 Parent(s): 75cb294

Switched to LCEL

Browse files

Files changed (13) hide show

climateqa/engine/__init__.py +0 -0
climateqa/engine/embeddings.py +24 -0
climateqa/{llm.py → engine/llm.py} +0 -0
climateqa/{chains.py → engine/old/chains.py} +0 -26
climateqa/{chat.py → engine/old/chat.py} +0 -0
climateqa/{custom_retrieval_chain.py → engine/old/custom_retrieval_chain.py} +0 -0
climateqa/{prompts.py → engine/prompts.py} +20 -11
climateqa/engine/rag.py +64 -0
climateqa/engine/reformulation.py +28 -0
climateqa/{retriever.py → engine/retriever.py} +19 -9
climateqa/engine/utils.py +50 -0
climateqa/engine/vectorstore.py +44 -0
climateqa/parser/__init__.py +0 -0

climateqa/engine/__init__.py ADDED Viewed

File without changes

climateqa/engine/embeddings.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from langchain.embeddings import HuggingFaceBgeEmbeddings
+from langchain.embeddings import HuggingFaceEmbeddings
+def get_embeddings_function(version = "v1.2"):
+    """Return the embeddings object matching the requested version.
+
+    Parameters:
+        version (str): "v1.2" selects BAAI/bge-base-en-v1.5 through
+            HuggingFaceBgeEmbeddings; any other value selects
+            sentence-transformers/multi-qa-mpnet-base-dot-v1 through
+            HuggingFaceEmbeddings.
+    Returns:
+        The constructed embeddings object.
+    """
+    if version != "v1.2":
+        # Every version other than "v1.2" uses the multi-qa-mpnet model
+        return HuggingFaceEmbeddings(model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1")
+    # https://huggingface.co/BAAI/bge-base-en-v1.5
+    # Best embedding model at a reasonable size at the moment (2023-11-22)
+    return HuggingFaceBgeEmbeddings(
+        model_name="BAAI/bge-base-en-v1.5",
+        encode_kwargs={'normalize_embeddings': True}, # set True to compute cosine similarity
+        query_instruction="Represent this sentence for searching relevant passages: "
+    )

climateqa/{llm.py → engine/llm.py} RENAMED Viewed

File without changes

climateqa/{chains.py → engine/old/chains.py} RENAMED Viewed

@@ -10,32 +10,6 @@ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
 from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
 from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
-def load_reformulation_chain(llm):
-    prompt = PromptTemplate(
-        template = reformulation_prompt,
-        input_variables=["query"],
-    )
-    reformulation_chain = LLMChain(llm = llm,prompt = prompt,output_key="json")
-    # Parse the output
-    def parse_output(output):
-        query = output["query"]
-        json_output = json.loads(output["json"])
-        question = json_output.get("question", query)
-        language = json_output.get("language", "English")
-        return {
-            "question": question,
-            "language": language,
-        }
-    transform_chain = TransformChain(
-        input_variables=["json"], output_variables=["question","language"], transform=parse_output
-    )
-    reformulation_chain = SequentialChain(chains = [reformulation_chain,transform_chain],input_variables=["query"],output_variables=["question","language"])
-    return reformulation_chain
 def load_combine_documents_chain(llm):
     prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])

 from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
 from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
 def load_combine_documents_chain(llm):
     prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])

climateqa/{chat.py → engine/old/chat.py} RENAMED Viewed

File without changes

climateqa/{custom_retrieval_chain.py → engine/old/custom_retrieval_chain.py} RENAMED Viewed

File without changes

climateqa/{prompts.py → engine/prompts.py} RENAMED Viewed

@@ -1,33 +1,42 @@
 # If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A
-reformulation_prompt = """
 Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
 ---
 query: La technologie nous sauvera-t-elle ?
-question: Can technology help humanity mitigate the effects of climate change?
-language: French
 ---
 query: what are our reserves in fossil fuel?
-question: What are the current reserves of fossil fuels and how long will they last?
-language: English
 ---
 query: what are the main causes of climate change?
-question: What are the main causes of climate change in the last century?
-language: English
 ---
 Output the result as json with two keys "question" and "language"
 query: {query}
-answer:"""
-system_prompt = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics, you will act as a climate scientist and answer questions about climate change and biodiversity.
 You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 """
-answer_prompt = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 Guidelines:
@@ -42,7 +51,7 @@ Guidelines:
 -----------------------
 Passages:
-{summaries}
 -----------------------
 Question: {question} - Explained to {audience}

 # If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was built the eiffel tower"), return N/A
+reformulation_prompt_template = """
 Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
 ---
 query: La technologie nous sauvera-t-elle ?
+->
+'question': 'Can technology help humanity mitigate the effects of climate change?',
+'language': 'French',
 ---
 query: what are our reserves in fossil fuel?
+->
+'question': 'What are the current reserves of fossil fuels and how long will they last?',
+'language': 'English',
 ---
 query: what are the main causes of climate change?
+->
+'question': 'What are the main causes of climate change in the last century?',
+'language': 'English'
 ---
+{format_instructions}
+Reformulate the question in English and detect the language of the original message
 Output the result as json with two keys "question" and "language"
 query: {query}
+->
+```json
+"""
+system_prompt_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics, you will act as a climate scientist and answer questions about climate change and biodiversity.
 You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 """
+answer_prompt_template = """
 You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
 Guidelines:
 -----------------------
 Passages:
+{context}
 -----------------------
 Question: {question} - Explained to {audience}

climateqa/engine/rag.py ADDED Viewed

	@@ -0,0 +1,64 @@

+from operator import itemgetter
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema.output_parser import StrOutputParser
+from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
+from langchain.prompts.prompt import PromptTemplate
+from langchain.schema import format_document
+from climateqa.engine.reformulation import make_reformulation_chain
+from climateqa.engine.prompts import answer_prompt_template
+from climateqa.engine.utils import pass_values, flatten_dict
+# Default per-document prompt: renders only the document's page_content
+DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
+def _combine_documents(
+    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
+):
+    """Render a sequence of documents as a single string.
+
+    Parameters:
+        docs: The documents to render.
+        document_prompt: Prompt passed to `format_document` for each document.
+        sep (str): Separator placed between the rendered documents.
+    Returns:
+        (str): The rendered documents, each prefixed with "Doc <n>: " (numbered
+            from 1) and joined with `sep`.
+    """
+    doc_strings = [f"Doc {i+1}: " + format_document(doc, document_prompt) for i,doc in enumerate(docs)]
+    return sep.join(doc_strings)
+def make_rag_chain(retriever,llm):
+    """Compose the full question-answering chain: reformulation, retrieval, answer.
+
+    Parameters:
+        retriever: Retriever piped after the reformulated "question".
+        llm: Language model used both for the reformulation and for the answer.
+    Returns:
+        The composed chain `reformulation | find_documents | answer`. Its input is
+        read through the keys "audience" and "query"; its last step produces the
+        keys "answer", "question", "audience", "language", "query" and "docs".
+    """
+    # Construct the prompt
+    prompt = ChatPromptTemplate.from_template(answer_prompt_template)
+    # ------- CHAIN 0 - Reformulation
+    reformulation_chain = make_reformulation_chain(llm)
+    # The reformulation result is nested under "reformulation"; flatten_dict then
+    # lifts its keys to the top level (the parent key is dropped), so that the
+    # next steps can read "question" and "language" directly
+    reformulation = (
+        {"reformulation":reformulation_chain,**pass_values(["audience","query"])}
+        | RunnablePassthrough()
+        | flatten_dict
+    )
+    # ------- CHAIN 1
+    # Retrieved documents
+    # The retriever is queried with the reformulated "question"; the other values
+    # are forwarded unchanged
+    find_documents =  {
+        "docs": itemgetter("question") | retriever,
+        **pass_values(["question","audience","language","query"])
+    } | RunnablePassthrough()
+    # ------- CHAIN 2
+    # Construct inputs for the llm
+    # "context" is the retrieved documents rendered as one string
+    input_documents = {
+        "context":lambda x : _combine_documents(x["docs"]),
+        **pass_values(["question","audience","language"])
+    }
+    # Generate the answer
+    # The answer text is returned next to the values it was built from
+    answer = {
+        "answer": input_documents | prompt | llm | StrOutputParser(),
+        **pass_values(["question","audience","language","query","docs"])
+    }
+    # ------- FINAL CHAIN
+    # Build the final chain
+    rag_chain = reformulation | find_documents | answer
+    return rag_chain

climateqa/engine/reformulation.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from langchain.output_parsers import StructuredOutputParser, ResponseSchema
+from langchain.prompts import PromptTemplate
+from langchain.llms import OpenAI
+from langchain.chat_models import ChatOpenAI
+from climateqa.engine.prompts import reformulation_prompt_template
+# Fields expected in the structured output of the reformulation step
+response_schemas = [
+    ResponseSchema(name="language", description="The detected language of the input message"),
+    ResponseSchema(name="question", description="The reformulated question always in English")
+]
+# Parser built from the schemas above; its format instructions are injected into
+# the reformulation prompt through the "format_instructions" partial variable
+output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
+format_instructions = output_parser.get_format_instructions()
+def make_reformulation_chain(llm):
+    """Build the chain that reformulates a user query and detects its language.
+
+    Parameters:
+        llm: Language model placed in the chain; it is bound with a stop sequence
+            through `llm.bind(stop=...)`.
+    Returns:
+        The composed chain `prompt | llm | output_parser`. The prompt takes a
+        "query" input variable; the output is parsed by the module-level
+        `output_parser` (schemas "language" and "question").
+    """
+    prompt = PromptTemplate(
+        template=reformulation_prompt_template,
+        input_variables=["query"],
+        partial_variables={"format_instructions": format_instructions}
+    )
+    # NOTE(review): the prompt template ends by opening a ```json block, so stopping
+    # at the next ``` presumably ends generation right after the JSON payload - confirm
+    chain = (prompt | llm.bind(stop=["```"]) | output_parser)
+    return chain

climateqa/{retriever.py → engine/retriever.py} RENAMED Viewed

@@ -12,12 +12,16 @@ from pydantic import Field
 class ClimateQARetriever(BaseRetriever):
     vectorstore:VectorStore
     sources:list = ["IPCC","IPBES"]
-    threshold:float = 22
     k_summary:int = 3
     k_total:int = 10
     namespace:str = "vectors"
-    def get_relevant_documents(self, query: str) -> List[Document]:
         # Check if all elements in the list are either IPCC or IPBES
         assert isinstance(self.sources,list)
@@ -25,16 +29,20 @@ class ClimateQARetriever(BaseRetriever):
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
         # Prepare base search kwargs
-        filters = {
-            "source": { "$in":self.sources},
-        }
         # Search for k_summary documents in the summaries dataset
         filters_summaries = {
             **filters,
             "report_type": { "$in":["SPM","TS"]},
         }
-        docs_summaries = self.vectorstore.similarity_search_with_score(query=query,namespace = self.namespace,filter = filters_summaries,k = self.k_summary)
         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
         # Search for k_total - k_summary documents in the full reports dataset
@@ -43,7 +51,7 @@ class ClimateQARetriever(BaseRetriever):
             "report_type": { "$nin":["SPM","TS"]},
         }
         k_full = self.k_total - len(docs_summaries)
-        docs_full = self.vectorstore.similarity_search_with_score(query=query,namespace = self.namespace,filter = filters_full,k = k_full)
         # Concatenate documents
         docs = docs_summaries + docs_full
@@ -57,11 +65,13 @@ class ClimateQARetriever(BaseRetriever):
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"])
-            doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
             results.append(doc)
-        return results

 class ClimateQARetriever(BaseRetriever):
     vectorstore:VectorStore
     sources:list = ["IPCC","IPBES"]
+    reports:list = []
+    threshold:float = 0.4
     k_summary:int = 3
     k_total:int = 10
     namespace:str = "vectors"
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
         # Check if all elements in the list are either IPCC or IPBES
         assert isinstance(self.sources,list)
         assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
         # Prepare base search kwargs
+        filters = {}
+        if len(self.reports) > 0:
+            filters["short_name"] = {"$in":self.reports}
+        else:
+            filters["source"] = { "$in":self.sources}
         # Search for k_summary documents in the summaries dataset
         filters_summaries = {
             **filters,
             "report_type": { "$in":["SPM","TS"]},
         }
+        docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
         docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
         # Search for k_total - k_summary documents in the full reports dataset
             "report_type": { "$nin":["SPM","TS"]},
         }
         k_full = self.k_total - len(docs_summaries)
+        docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
         # Concatenate documents
         docs = docs_summaries + docs_full
             doc.metadata["similarity_score"] = score
             doc.metadata["content"] = doc.page_content
             doc.metadata["page_number"] = int(doc.metadata["page_number"])
+            # doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
             results.append(doc)
+        # Sort by score
+        # results = sorted(results,key = lambda x : x.metadata["similarity_score"],reverse = True)
+        return results

climateqa/engine/utils.py ADDED Viewed

	@@ -0,0 +1,50 @@

+from typing import Any, Dict, Iterable, Tuple, Union
+from operator import itemgetter
+def pass_values(x):
+    """Map each given key to an `itemgetter` for that key.
+
+    Parameters:
+        x: A key or a list of keys; a non-list value is wrapped in a one-element list.
+    Returns:
+        (dict): `{key: itemgetter(key)}` for every key in `x`.
+    """
+    if not isinstance(x,list): x = [x]
+    return {k:itemgetter(k) for k in x}
+# Drawn from langchain utils and modified to remove the parent key
+def _flatten_dict(
+    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Iterable[Tuple[str, Any]]:
+    """
+    Generator that yields the leaf items of a nested dictionary.
+    Keys are yielded as-is, without any parent prefix, so the same key may be
+    yielded several times when it appears at different nesting levels.
+    Parameters:
+        nested_dict (dict): The nested dictionary to flatten.
+        parent_key (str): Not used by this implementation (no prefix is prepended);
+            presumably kept from the original langchain signature.
+        sep (str): Not used by this implementation; presumably kept from the
+            original langchain signature.
+    Yields:
+        (str, any): A leaf key-value pair of the nested dictionary.
+    """
+    for key, value in nested_dict.items():
+        # The key is kept unprefixed: this is the modification that removes the parent key
+        new_key = key
+        if isinstance(value, dict):
+            # Recurse into nested dictionaries; only their leaf items are yielded
+            yield from _flatten_dict(value, new_key, sep)
+        else:
+            yield new_key, value
+def flatten_dict(
+    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
+) -> Dict[str, Any]:
+    """Flattens a nested dictionary into a flat dictionary of its leaf items.
+    Keys keep their own name, without any parent prefix. When the same leaf key
+    appears more than once, the last one encountered overwrites the previous ones.
+    Parameters:
+        nested_dict (dict): The nested dictionary to flatten.
+        parent_key (str): Forwarded to `_flatten_dict`, which does not use it.
+        sep (str): Forwarded to `_flatten_dict`, which does not use it.
+    Returns:
+        (dict): A flat dictionary.
+    """
+    flat_dict = {k: v for k, v in _flatten_dict(nested_dict, parent_key, sep)}
+    return flat_dict

climateqa/engine/vectorstore.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# Pinecone
+# More info at https://docs.pinecone.io/docs/langchain
+# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
+import os
+import pinecone
+from langchain.vectorstores import Pinecone
+# LOAD ENVIRONMENT VARIABLES
+# python-dotenv is optional: when it is not installed, the variables read below
+# must already be set in the process environment.
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    # Only a missing package is ignored; a bare except would also swallow
+    # KeyboardInterrupt/SystemExit and hide unrelated errors
+    pass
+def get_pinecone_vectorstore(embeddings,text_key = "text"):
+    """Initialize Pinecone and open an existing index as a vector store.
+
+    The configuration is read from the environment variables PINECONE_API_KEY,
+    PINECONE_API_ENVIRONMENT and PINECONE_API_INDEX.
+
+    Parameters:
+        embeddings: Embeddings object passed to `Pinecone.from_existing_index`.
+        text_key (str): Forwarded as `text_key` to `Pinecone.from_existing_index`.
+    Returns:
+        The vector store returned by `Pinecone.from_existing_index`.
+    """
+    # initialize pinecone
+    pinecone.init(
+        api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
+        environment=os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
+    )
+    index_name = os.getenv("PINECONE_API_INDEX")
+    vectorstore = Pinecone.from_existing_index(index_name, embeddings,text_key = text_key)
+    return vectorstore
+# def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
+#     assert isinstance(sources,list)
+#     # Check if all elements in the list are either IPCC or IPBES
+#     filter = {
+#         "source": { "$in":sources},
+#     }
+#     retriever = vectorstore.as_retriever(search_kwargs={
+#         "k": k,
+#         "namespace":"vectors",
+#         "filter":filter
+#     })
+#     return retriever

climateqa/parser/__init__.py ADDED Viewed

File without changes