Switched to LCEL
Browse files- climateqa/engine/__init__.py +0 -0
- climateqa/engine/embeddings.py +24 -0
- climateqa/{llm.py → engine/llm.py} +0 -0
- climateqa/{chains.py → engine/old/chains.py} +0 -26
- climateqa/{chat.py → engine/old/chat.py} +0 -0
- climateqa/{custom_retrieval_chain.py → engine/old/custom_retrieval_chain.py} +0 -0
- climateqa/{prompts.py → engine/prompts.py} +20 -11
- climateqa/engine/rag.py +64 -0
- climateqa/engine/reformulation.py +28 -0
- climateqa/{retriever.py → engine/retriever.py} +19 -9
- climateqa/engine/utils.py +50 -0
- climateqa/engine/vectorstore.py +44 -0
- climateqa/parser/__init__.py +0 -0
climateqa/engine/__init__.py
ADDED
|
File without changes
|
climateqa/engine/embeddings.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
| 3 |
+
from langchain.embeddings import HuggingFaceEmbeddings
|
| 4 |
+
|
| 5 |
+
def get_embeddings_function(version = "v1.2"):
    """Instantiate the embeddings model matching an index version.

    Parameters:
        version (str): Embeddings release to load. ``"v1.2"`` selects the
            BGE model; any other value selects the multi-qa MPNet model.

    Returns:
        The LangChain embeddings object for the requested version.
    """
    if version != "v1.2":
        # Every version other than "v1.2" uses the sentence-transformers model.
        return HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-dot-v1")

    # https://huggingface.co/BAAI/bge-base-en-v1.5
    # Best embedding model at a reasonable size at the moment (2023-11-22)
    return HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-base-en-v1.5",
        # set True to compute cosine similarity
        encode_kwargs={"normalize_embeddings": True},
        query_instruction="Represent this sentence for searching relevant passages: ",
    )
|
climateqa/{llm.py → engine/llm.py}
RENAMED
|
File without changes
|
climateqa/{chains.py → engine/old/chains.py}
RENAMED
|
@@ -10,32 +10,6 @@ from langchain.chains.qa_with_sources import load_qa_with_sources_chain
|
|
| 10 |
from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
|
| 11 |
from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
|
| 12 |
|
| 13 |
-
def load_reformulation_chain(llm):
|
| 14 |
-
|
| 15 |
-
prompt = PromptTemplate(
|
| 16 |
-
template = reformulation_prompt,
|
| 17 |
-
input_variables=["query"],
|
| 18 |
-
)
|
| 19 |
-
reformulation_chain = LLMChain(llm = llm,prompt = prompt,output_key="json")
|
| 20 |
-
|
| 21 |
-
# Parse the output
|
| 22 |
-
def parse_output(output):
|
| 23 |
-
query = output["query"]
|
| 24 |
-
json_output = json.loads(output["json"])
|
| 25 |
-
question = json_output.get("question", query)
|
| 26 |
-
language = json_output.get("language", "English")
|
| 27 |
-
return {
|
| 28 |
-
"question": question,
|
| 29 |
-
"language": language,
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
transform_chain = TransformChain(
|
| 33 |
-
input_variables=["json"], output_variables=["question","language"], transform=parse_output
|
| 34 |
-
)
|
| 35 |
-
|
| 36 |
-
reformulation_chain = SequentialChain(chains = [reformulation_chain,transform_chain],input_variables=["query"],output_variables=["question","language"])
|
| 37 |
-
return reformulation_chain
|
| 38 |
-
|
| 39 |
|
| 40 |
def load_combine_documents_chain(llm):
|
| 41 |
prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])
|
|
|
|
| 10 |
from climateqa.prompts import answer_prompt, reformulation_prompt,audience_prompts
|
| 11 |
from climateqa.custom_retrieval_chain import CustomRetrievalQAWithSourcesChain
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def load_combine_documents_chain(llm):
|
| 15 |
prompt = PromptTemplate(template=answer_prompt, input_variables=["summaries", "question","audience","language"])
|
climateqa/{chat.py → engine/old/chat.py}
RENAMED
|
File without changes
|
climateqa/{custom_retrieval_chain.py → engine/old/custom_retrieval_chain.py}
RENAMED
|
File without changes
|
climateqa/{prompts.py → engine/prompts.py}
RENAMED
|
@@ -1,33 +1,42 @@
|
|
| 1 |
|
| 2 |
# If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was the Eiffel Tower built"), return N/A
|
| 3 |
|
| 4 |
-
|
| 5 |
Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
|
| 6 |
---
|
| 7 |
query: La technologie nous sauvera-t-elle ?
|
| 8 |
-
|
| 9 |
-
|
|
|
|
| 10 |
---
|
| 11 |
query: what are our reserves in fossil fuel?
|
| 12 |
-
|
| 13 |
-
|
|
|
|
| 14 |
---
|
| 15 |
query: what are the main causes of climate change?
|
| 16 |
-
|
| 17 |
-
|
|
|
|
| 18 |
---
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
Output the result as json with two keys "question" and "language"
|
| 21 |
query: {query}
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
You are ClimateQ&A, an AI Assistant created by Ekimetrics, you will act as a climate scientist and answer questions about climate change and biodiversity.
|
| 26 |
You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
|
| 27 |
"""
|
| 28 |
|
| 29 |
|
| 30 |
-
|
| 31 |
You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
|
| 32 |
|
| 33 |
Guidelines:
|
|
@@ -42,7 +51,7 @@ Guidelines:
|
|
| 42 |
|
| 43 |
-----------------------
|
| 44 |
Passages:
|
| 45 |
-
{
|
| 46 |
|
| 47 |
-----------------------
|
| 48 |
Question: {question} - Explained to {audience}
|
|
|
|
| 1 |
|
| 2 |
# If the message is not relevant to climate change (like "How are you", "I am 18 years old" or "When was the Eiffel Tower built"), return N/A
|
| 3 |
|
| 4 |
+
reformulation_prompt_template = """
|
| 5 |
Reformulate the following user message to be a short standalone question in English, in the context of an educational discussion about climate change.
|
| 6 |
---
|
| 7 |
query: La technologie nous sauvera-t-elle ?
|
| 8 |
+
->
|
| 9 |
+
'question': 'Can technology help humanity mitigate the effects of climate change?',
|
| 10 |
+
'language': 'French',
|
| 11 |
---
|
| 12 |
query: what are our reserves in fossil fuel?
|
| 13 |
+
->
|
| 14 |
+
'question': 'What are the current reserves of fossil fuels and how long will they last?',
|
| 15 |
+
'language': 'English',
|
| 16 |
---
|
| 17 |
query: what are the main causes of climate change?
|
| 18 |
+
->
|
| 19 |
+
'question': 'What are the main causes of climate change in the last century?',
|
| 20 |
+
'language': 'English'
|
| 21 |
---
|
| 22 |
|
| 23 |
+
{format_instructions}
|
| 24 |
+
|
| 25 |
+
Reformulate the question in English and detect the language of the original message
|
| 26 |
Output the result as json with two keys "question" and "language"
|
| 27 |
query: {query}
|
| 28 |
+
->
|
| 29 |
+
```json
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
|
| 33 |
+
system_prompt_template = """
|
| 34 |
You are ClimateQ&A, an AI Assistant created by Ekimetrics, you will act as a climate scientist and answer questions about climate change and biodiversity.
|
| 35 |
You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
|
| 36 |
"""
|
| 37 |
|
| 38 |
|
| 39 |
+
answer_prompt_template = """
|
| 40 |
You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a question and extracted passages of the IPCC and/or IPBES reports. Provide a clear and structured answer based on the passages provided, the context and the guidelines.
|
| 41 |
|
| 42 |
Guidelines:
|
|
|
|
| 51 |
|
| 52 |
-----------------------
|
| 53 |
Passages:
|
| 54 |
+
{context}
|
| 55 |
|
| 56 |
-----------------------
|
| 57 |
Question: {question} - Explained to {audience}
|
climateqa/engine/rag.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from operator import itemgetter
|
| 2 |
+
|
| 3 |
+
from langchain.prompts import ChatPromptTemplate
|
| 4 |
+
from langchain.schema.output_parser import StrOutputParser
|
| 5 |
+
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
|
| 6 |
+
from langchain.prompts.prompt import PromptTemplate
|
| 7 |
+
from langchain.schema import format_document
|
| 8 |
+
|
| 9 |
+
from climateqa.engine.reformulation import make_reformulation_chain
|
| 10 |
+
from climateqa.engine.prompts import answer_prompt_template
|
| 11 |
+
from climateqa.engine.utils import pass_values, flatten_dict
|
| 12 |
+
|
| 13 |
+
# Default template used by _combine_documents to render one document: only its page_content.
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")
|
| 14 |
+
|
| 15 |
+
def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, sep="\n\n"
):
    """Render documents as one numbered text block.

    Parameters:
        docs: Iterable of documents to render.
        document_prompt: Prompt template applied to each document through
            ``format_document``.
        sep (str): Separator placed between the rendered documents.

    Returns:
        (str): The rendered documents, each prefixed with ``Doc <n>: ``
        (numbering starts at 1), joined with ``sep``.
    """
    numbered = []
    for position, document in enumerate(docs, start=1):
        rendered = format_document(document, document_prompt)
        numbered.append(f"Doc {position}: " + rendered)
    return sep.join(numbered)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def make_rag_chain(retriever,llm):
    """Assemble the reformulate -> retrieve -> answer LCEL pipeline.

    Parameters:
        retriever: Runnable fed with the "question" value to fetch documents.
        llm: Language model used for both reformulation and answering.

    Returns:
        A runnable that reads the "query" and "audience" keys of its input
        and produces a dict with the keys "answer", "question", "audience",
        "language", "query" and "docs".
        NOTE(review): "question" and "language" are expected to come from the
        reformulation chain's parsed output once flattened - confirm against
        make_reformulation_chain.
    """
    answer_prompt = ChatPromptTemplate.from_template(answer_prompt_template)

    # ------- STEP 0 - Reformulation
    # The nested reformulation result is flattened so that its keys sit next
    # to the passed-through "audience" and "query" values.
    reformulation_inputs = {
        "reformulation": make_reformulation_chain(llm),
        **pass_values(["audience", "query"]),
    }
    reformulate = reformulation_inputs | RunnablePassthrough() | flatten_dict

    # ------- STEP 1 - Retrieval
    # Fetch the documents for the reformulated question and keep the other
    # values flowing through the chain.
    retrieval_inputs = {
        "docs": itemgetter("question") | retriever,
        **pass_values(["question", "audience", "language", "query"]),
    }
    retrieve = retrieval_inputs | RunnablePassthrough()

    # ------- STEP 2 - Answer generation
    # Variables handed to the answer prompt.
    llm_inputs = {
        "context": lambda x: _combine_documents(x["docs"]),
        **pass_values(["question", "audience", "language"]),
    }
    # The generated answer is returned alongside the values used to produce it.
    generate = {
        "answer": llm_inputs | answer_prompt | llm | StrOutputParser(),
        **pass_values(["question", "audience", "language", "query", "docs"]),
    }

    # ------- FINAL CHAIN
    return reformulate | retrieve | generate
|
| 64 |
+
|
climateqa/engine/reformulation.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
|
| 3 |
+
from langchain.prompts import PromptTemplate
|
| 4 |
+
from langchain.llms import OpenAI
|
| 5 |
+
from langchain.chat_models import ChatOpenAI
|
| 6 |
+
|
| 7 |
+
from climateqa.engine.prompts import reformulation_prompt_template
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# Fields the reformulation LLM must return, in the order they are described
# to the model.
response_schemas = [
    ResponseSchema(
        name="language",
        description="The detected language of the input message",
    ),
    ResponseSchema(
        name="question",
        description="The reformulated question always in English",
    ),
]

# Parser for the structured answer, and the matching instructions that get
# injected into the reformulation prompt.
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def make_reformulation_chain(llm):
    """Build the chain that reformulates a user message.

    Parameters:
        llm: Language model runnable; it is bound with a stop sequence on the
            Markdown code fence.

    Returns:
        A runnable taking a "query" input, prompting the model with the
        reformulation template and parsing the output with the module-level
        ``output_parser``.
    """
    # Generation is stopped at the next ``` fence.
    # NOTE(review): presumably because the prompt template ends on an opening
    # ```json fence - confirm against reformulation_prompt_template.
    fenced_llm = llm.bind(stop=["```"])

    reformulation_prompt = PromptTemplate(
        input_variables=["query"],
        partial_variables={"format_instructions": format_instructions},
        template=reformulation_prompt_template,
    )

    return reformulation_prompt | fenced_llm | output_parser
|
climateqa/{retriever.py → engine/retriever.py}
RENAMED
|
@@ -12,12 +12,16 @@ from pydantic import Field
|
|
| 12 |
class ClimateQARetriever(BaseRetriever):
|
| 13 |
vectorstore:VectorStore
|
| 14 |
sources:list = ["IPCC","IPBES"]
|
| 15 |
-
|
|
|
|
| 16 |
k_summary:int = 3
|
| 17 |
k_total:int = 10
|
| 18 |
namespace:str = "vectors"
|
| 19 |
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
# Check if all elements in the list are either IPCC or IPBES
|
| 23 |
assert isinstance(self.sources,list)
|
|
@@ -25,16 +29,20 @@ class ClimateQARetriever(BaseRetriever):
|
|
| 25 |
assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
|
| 26 |
|
| 27 |
# Prepare base search kwargs
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# Search for k_summary documents in the summaries dataset
|
| 33 |
filters_summaries = {
|
| 34 |
**filters,
|
| 35 |
"report_type": { "$in":["SPM","TS"]},
|
| 36 |
}
|
| 37 |
-
|
|
|
|
| 38 |
docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
|
| 39 |
|
| 40 |
# Search for k_total - k_summary documents in the full reports dataset
|
|
@@ -43,7 +51,7 @@ class ClimateQARetriever(BaseRetriever):
|
|
| 43 |
"report_type": { "$nin":["SPM","TS"]},
|
| 44 |
}
|
| 45 |
k_full = self.k_total - len(docs_summaries)
|
| 46 |
-
docs_full = self.vectorstore.similarity_search_with_score(query=query,
|
| 47 |
|
| 48 |
# Concatenate documents
|
| 49 |
docs = docs_summaries + docs_full
|
|
@@ -57,11 +65,13 @@ class ClimateQARetriever(BaseRetriever):
|
|
| 57 |
doc.metadata["similarity_score"] = score
|
| 58 |
doc.metadata["content"] = doc.page_content
|
| 59 |
doc.metadata["page_number"] = int(doc.metadata["page_number"])
|
| 60 |
-
doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
|
| 61 |
results.append(doc)
|
| 62 |
|
| 63 |
-
|
|
|
|
| 64 |
|
|
|
|
| 65 |
|
| 66 |
|
| 67 |
|
|
|
|
| 12 |
class ClimateQARetriever(BaseRetriever):
|
| 13 |
vectorstore:VectorStore
|
| 14 |
sources:list = ["IPCC","IPBES"]
|
| 15 |
+
reports:list = []
|
| 16 |
+
threshold:float = 0.4
|
| 17 |
k_summary:int = 3
|
| 18 |
k_total:int = 10
|
| 19 |
namespace:str = "vectors"
|
| 20 |
|
| 21 |
+
|
| 22 |
+
def _get_relevant_documents(
|
| 23 |
+
self, query: str, *, run_manager: CallbackManagerForRetrieverRun
|
| 24 |
+
) -> List[Document]:
|
| 25 |
|
| 26 |
# Check if all elements in the list are either IPCC or IPBES
|
| 27 |
assert isinstance(self.sources,list)
|
|
|
|
| 29 |
assert self.k_total > self.k_summary, "k_total should be greater than k_summary"
|
| 30 |
|
| 31 |
# Prepare base search kwargs
|
| 32 |
+
|
| 33 |
+
filters = {}
|
| 34 |
+
if len(self.reports) > 0:
|
| 35 |
+
filters["short_name"] = {"$in":self.reports}
|
| 36 |
+
else:
|
| 37 |
+
filters["source"] = { "$in":self.sources}
|
| 38 |
|
| 39 |
# Search for k_summary documents in the summaries dataset
|
| 40 |
filters_summaries = {
|
| 41 |
**filters,
|
| 42 |
"report_type": { "$in":["SPM","TS"]},
|
| 43 |
}
|
| 44 |
+
|
| 45 |
+
docs_summaries = self.vectorstore.similarity_search_with_score(query=query,filter = filters_summaries,k = self.k_summary)
|
| 46 |
docs_summaries = [x for x in docs_summaries if x[1] > self.threshold]
|
| 47 |
|
| 48 |
# Search for k_total - k_summary documents in the full reports dataset
|
|
|
|
| 51 |
"report_type": { "$nin":["SPM","TS"]},
|
| 52 |
}
|
| 53 |
k_full = self.k_total - len(docs_summaries)
|
| 54 |
+
docs_full = self.vectorstore.similarity_search_with_score(query=query,filter = filters_full,k = k_full)
|
| 55 |
|
| 56 |
# Concatenate documents
|
| 57 |
docs = docs_summaries + docs_full
|
|
|
|
| 65 |
doc.metadata["similarity_score"] = score
|
| 66 |
doc.metadata["content"] = doc.page_content
|
| 67 |
doc.metadata["page_number"] = int(doc.metadata["page_number"])
|
| 68 |
+
# doc.page_content = f"""Doc {i+1} - {doc.metadata['short_name']}: {doc.page_content}"""
|
| 69 |
results.append(doc)
|
| 70 |
|
| 71 |
+
# Sort by score
|
| 72 |
+
# results = sorted(results,key = lambda x : x.metadata["similarity_score"],reverse = True)
|
| 73 |
|
| 74 |
+
return results
|
| 75 |
|
| 76 |
|
| 77 |
|
climateqa/engine/utils.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from typing import Any, Dict, Iterable, Tuple, Union
|
| 3 |
+
from operator import itemgetter
|
| 4 |
+
|
| 5 |
+
def pass_values(x):
    """Build a ``{key: itemgetter(key)}`` mapping for the given key(s).

    Parameters:
        x: A list of keys, or a single key (any non-list value is treated as
            one key).

    Returns:
        (dict): One ``operator.itemgetter`` per key, in the order given.
    """
    keys = x if isinstance(x, list) else [x]
    getters = {}
    for key in keys:
        getters[key] = itemgetter(key)
    return getters
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
# Drawn from langchain utils and modified to remove the parent key
|
| 11 |
+
def _flatten_dict(
|
| 12 |
+
nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
|
| 13 |
+
) -> Iterable[Tuple[str, Any]]:
|
| 14 |
+
"""
|
| 15 |
+
Generator that yields flattened items from a nested dictionary for a flat dict.
|
| 16 |
+
|
| 17 |
+
Parameters:
|
| 18 |
+
nested_dict (dict): The nested dictionary to flatten.
|
| 19 |
+
parent_key (str): The prefix to prepend to the keys of the flattened dict.
|
| 20 |
+
sep (str): The separator to use between the parent key and the key of the
|
| 21 |
+
flattened dictionary.
|
| 22 |
+
|
| 23 |
+
Yields:
|
| 24 |
+
(str, any): A key-value pair from the flattened dictionary.
|
| 25 |
+
"""
|
| 26 |
+
for key, value in nested_dict.items():
|
| 27 |
+
new_key = key
|
| 28 |
+
if isinstance(value, dict):
|
| 29 |
+
yield from _flatten_dict(value, new_key, sep)
|
| 30 |
+
else:
|
| 31 |
+
yield new_key, value
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def flatten_dict(
    nested_dict: Dict[str, Any], parent_key: str = "", sep: str = "_"
) -> Dict[str, Any]:
    """Collapse a nested dictionary into a single-level dictionary.

    The items come from ``_flatten_dict``. When the same key is produced more
    than once, the last value wins.

    Parameters:
        nested_dict (dict): The nested dictionary to flatten.
        parent_key (str): Forwarded unchanged to ``_flatten_dict``.
        sep (str): Forwarded unchanged to ``_flatten_dict``.

    Returns:
        (dict): A flat dictionary.

    """
    return dict(_flatten_dict(nested_dict, parent_key, sep))
|
climateqa/engine/vectorstore.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Pinecone
|
| 2 |
+
# More info at https://docs.pinecone.io/docs/langchain
|
| 3 |
+
# And https://python.langchain.com/docs/integrations/vectorstores/pinecone
|
| 4 |
+
import os
|
| 5 |
+
import pinecone
|
| 6 |
+
from langchain.vectorstores import Pinecone
|
| 7 |
+
|
| 8 |
+
# LOAD ENVIRONMENT VARIABLES
|
| 9 |
+
# python-dotenv is optional: when it is not installed, the environment
# variables are expected to be provided by the process environment.
try:
    from dotenv import load_dotenv
except ImportError:
    # Only a missing package is tolerated; a bare ``except`` would also hide
    # KeyboardInterrupt, SystemExit and genuine bugs.
    pass
else:
    load_dotenv()
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_pinecone_vectorstore(embeddings,text_key = "text"):
    """Connect to the existing Pinecone index and wrap it as a vector store.

    The connection settings are read from the PINECONE_API_KEY,
    PINECONE_API_ENVIRONMENT and PINECONE_API_INDEX environment variables.

    Parameters:
        embeddings: Embeddings object passed to the vector store.
        text_key (str): Passed to the vector store as ``text_key``.

    Returns:
        The LangChain ``Pinecone`` vector store built from the existing index.
    """
    # initialize pinecone
    credentials = {
        "api_key": os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
        "environment": os.getenv("PINECONE_API_ENVIRONMENT"),  # next to api key in console
    }
    pinecone.init(**credentials)

    return Pinecone.from_existing_index(
        os.getenv("PINECONE_API_INDEX"), embeddings, text_key=text_key
    )
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# def get_pinecone_retriever(vectorstore,k = 10,namespace = "vectors",sources = ["IPBES","IPCC"]):
|
| 30 |
+
|
| 31 |
+
# assert isinstance(sources,list)
|
| 32 |
+
|
| 33 |
+
# # Check if all elements in the list are either IPCC or IPBES
|
| 34 |
+
# filter = {
|
| 35 |
+
# "source": { "$in":sources},
|
| 36 |
+
# }
|
| 37 |
+
|
| 38 |
+
# retriever = vectorstore.as_retriever(search_kwargs={
|
| 39 |
+
# "k": k,
|
| 40 |
+
# "namespace":"vectors",
|
| 41 |
+
# "filter":filter
|
| 42 |
+
# })
|
| 43 |
+
|
| 44 |
+
# return retriever
|
climateqa/parser/__init__.py
ADDED
|
File without changes
|