Spaces:

nickmuchi
/

Earnings-Call-Analysis-Whisperer

Running

App Files Files Community

nickmuchi committed on Feb 5, 2023

Commit

08e6e30

1 Parent(s): d3132eb

Update functions.py

Browse files

Files changed (1) hide show

functions.py +117 -2

functions.py CHANGED Viewed

@@ -21,7 +21,16 @@ import pickle, math
 import wikipedia
 from pyvis.network import Network
 import torch
-from InstructorEmbedding import INSTRUCTOR
 nltk.download('punkt')
@@ -32,6 +41,59 @@ time_str = time.strftime("%d%m%Y-%H%M%S")
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
 margin-bottom: 2.5rem">{}</div> """
 @st.experimental_singleton(suppress_st_warning=True)
 def load_models():
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
@@ -40,12 +102,13 @@ def load_models():
     kg_tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
     sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
     sum_pipe = pipeline("summarization",model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn",clean_up_tokenization_spaces=True)
     ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
     cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1') #cross-encoder/ms-marco-MiniLM-L-12-v2
-    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer
 @st.experimental_singleton(suppress_st_warning=True)
 def load_asr_model(asr_model_name):
@@ -62,6 +125,58 @@ def load_sbert(model_name):
     return sbert
 @st.experimental_memo(suppress_st_warning=True)
 def embed_text(query,corpus,embedding_model):

 import wikipedia
 from pyvis.network import Network
 import torch
+from langchain.docstore.document import Document
+from langchain.embeddings import HuggingFaceEmbeddings,HuggingFaceInstructEmbeddings
+from langchain.vectorstores import Pinecone
+from langchain.chains.qa_with_sources import load_qa_with_sources_chain
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.llms import OpenAI
+from langchain import VectorDBQA
+from langchain.chains.question_answering import load_qa_chain
+from langchain.prompts import PromptTemplate
+from langchain.prompts.base import RegexParser
 nltk.download('punkt')
 HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
 margin-bottom: 2.5rem">{}</div> """
+# Prompt template for the "stuff" chain type: asks for an answer with SOURCES plus a 0-100 score
+output_parser = RegexParser(
+    regex=r"(.*?)\nScore: (.*)",
+    output_keys=["answer", "score"],
+)
+template = """Given the following extracted parts of a long document and a question, create a final answer with references ("SOURCES").
+If you don't know the answer, just say that you don't know. Don't try to make up an answer.
+ALWAYS return a "SOURCES" part in your answer.
+In addition to giving an answer, also return a score of how fully it answered the user's question. This should be in the following format:
+Question: [question here]
+Helpful Answer: [answer here]
+Score: [score between 0 and 100]
+Begin!
+Context:
+---------
+{summaries}
+---------
+Question: {question}
+Helpful Answer:"""
+# Prompt templates for the "refine" chain type (refinement prompt and initial question prompt)
+refine_prompt_template = (
+    "The original question is as follows: {question}\n"
+    "We have provided an existing answer: {existing_answer}\n"
+    "We have the opportunity to refine the existing answer"
+    "(only if needed) with some more context below.\n"
+    "------------\n"
+    "{context_str}\n"
+    "------------\n"
+    "Given the new context, refine the original answer to better "
+    "answer the question. "
+    "If the context isn't useful, return the original answer."
+)
+refine_prompt = PromptTemplate(
+    input_variables=["question", "existing_answer", "context_str"],
+    template=refine_prompt_template,
+)
+initial_qa_template = (
+    "Context information is below. \n"
+    "---------------------\n"
+    "{context_str}"
+    "\n---------------------\n"
+    "Given the context information and not prior knowledge, "
+    "answer the question: {question}\n.\n"
+)
 @st.experimental_singleton(suppress_st_warning=True)
 def load_models():
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     kg_tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
+    emb_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-xl')
     sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
     sum_pipe = pipeline("summarization",model="facebook/bart-large-cnn", tokenizer="facebook/bart-large-cnn",clean_up_tokenization_spaces=True)
     ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
     cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1') #cross-encoder/ms-marco-MiniLM-L-12-v2
+    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, kg_model, kg_tokenizer, emb_tokenizer
 @st.experimental_singleton(suppress_st_warning=True)
 def load_asr_model(asr_model_name):
     return sbert
+@st.experimental_memo(suppress_st_warning=True)
+def embed_text(query,corpus,title,embedding_model,chain_type='stuff'):
+    '''Embed text and generate semantic search scores'''
+    index_id = "earnings-embeddings"
+    if 'hkunlp' in embedding_model:
+        embeddings = HuggingFaceInstructEmbeddings(model_name=f'hkunlp/{embedding_model}',
+                                           query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
+                                           embed_instruction='Represent the Financial paragraph for retrieval: ')
+    else:
+        embeddings = HuggingFaceEmbeddings(model_name=f'sentence-transformers/{embedding_model}')
+    docsearch = Pinecone.from_texts(
+        corpus,
+        embeddings,
+        index_name = index_id,
+        namespace = f'{title}-earnings',
+        metadatas = [
+        {'source':i} for i in range(len(texts))]
+    )
+    docs = docsearch.similarity_search_with_score(query, k=3, namespace = f'{title}-earnings')
+    docs = [d[0] for d in docs]
+    if chain_type == 'stuff':
+        PROMPT = PromptTemplate(template=template,
+                                input_variables=["summaries", "question"],
+                                output_parser=output_parser)
+        chain = load_qa_with_sources_chain(OpenAI(temperature=0),
+                                           chain_type="stuff",
+                                           prompt=PROMPT,
+                                           )
+        answer = chain({"input_documents": docs, "question": query}, return_only_outputs=True)
+        return answer['output_text']
+    elif chain_type == 'refine':
+    return hits
 @st.experimental_memo(suppress_st_warning=True)
 def embed_text(query,corpus,embedding_model):