import whisper
import os
import random
import openai
from openai import OpenAI
import yt_dlp
from pytube import YouTube, extract
import pandas as pd
import plotly_express as px
import nltk
import plotly.graph_objects as go
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import streamlit as st
import en_core_web_lg
import validators
import re
import itertools
import numpy as np
from bs4 import BeautifulSoup
import base64, time
from annotated_text import annotated_text
import pickle, math
import torch
from pydub import AudioSegment
from newspaper import Article  # needed by get_article() below
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceBgeEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain

from langchain.callbacks import StreamlitCallbackHandler
from langchain.agents import OpenAIFunctionsAgent, AgentExecutor
from langchain.agents.agent_toolkits import create_retriever_tool
from langchain.agents.openai_functions_agent.agent_token_buffer_memory import (
    AgentTokenBufferMemory,
)
from langchain.prompts import MessagesPlaceholder

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

from langchain.prompts import PromptTemplate

from langsmith import Client
					
						
client = Client()
openai_audio = OpenAI()  # reads the OPENAI_API_KEY environment variable
nltk.download('punkt')

from nltk import sent_tokenize

OPEN_AI_KEY = os.environ.get('OPEN_AI_KEY')
time_str = time.strftime("%d%m%Y-%H%M%S")
HTML_WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem;
margin-bottom: 2.5rem">{}</div> """
					
						
@st.cache_resource
def load_models():
    '''Load and cache all the models to be used'''
    q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
    ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
    q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
    ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
    sent_pipe = pipeline("text-classification", model=q_model, tokenizer=q_tokenizer)
    sum_pipe = pipeline("summarization", model="philschmid/flan-t5-base-samsum", clean_up_tokenization_spaces=True)
    ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
    sbert = SentenceTransformer('all-MiniLM-L6-v2')

    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert
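# Note: @st.cache_resource (rather than @st.cache_data) is the appropriate cache
# here -- Streamlit's data cache would try to serialize the returned torch models,
# while the resource cache keeps them as live singletons shared across reruns.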
					
						
@st.cache_resource
def load_asr_model(model_name):
    '''Load the open-source Whisper model for cases where the API is not working'''
    # cache_resource: the returned torch model is not serializable by st.cache_data
    model = whisper.load_model(model_name)

    return model
					
						
@st.cache_resource
def get_spacy():
    nlp = en_core_web_lg.load()
    return nlp

nlp = get_spacy()

sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert = load_models()
					
						
@st.cache_data
def get_yt_audio(url):
    '''Get YT video from given URL link'''
    yt = YouTube(url)

    title = yt.title

    # .download() returns the local path of the saved audio file
    audio_stream = yt.streams.filter(only_audio=True).first().download()

    return audio_stream, title
					
						
@st.cache_data
def get_yt_audio_dl(url):
    '''Back up for when pytube is down'''

    temp_audio_file = os.path.join('output', 'audio')

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': temp_audio_file,
        'quiet': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)
        title = info.get('title', None)
        ydl.download([url])

    audio_file = os.path.join('output', 'audio.mp3')

    return audio_file, title
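# With the options above, yt-dlp saves the best audio stream to 'output/audio'
# and the FFmpegExtractAudio postprocessor transcodes it to mp3, appending the
# extension -- hence the hard-coded 'output/audio.mp3' path in the return value.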
					
						
@st.cache_data
def load_whisper_api(audio):
    '''Transcribe YT audio to text using the OpenAI API'''
    with open(audio, "rb") as file:
        transcript = openai_audio.audio.transcriptions.create(model="whisper-1", file=file, response_format="text")

    return transcript
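# Because response_format="text" is requested, the API returns the transcript as
# a plain string rather than a JSON object. The hosted Whisper endpoint also caps
# uploads at 25 MB, which is why the callers below chunk larger files first.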
					
						
@st.cache_data
def transcribe_yt_video(link, py_tube=True):
    '''Transcribe YouTube video'''

    if py_tube:
        audio_file, title = get_yt_audio(link)

        print(f'audio_file:{audio_file}')

        st.session_state['audio'] = audio_file

        print(f"audio_file_session_state:{st.session_state['audio']}")

        audio_size = round(os.path.getsize(st.session_state['audio'])/(1024*1024), 1)

        if audio_size <= 25:
            st.info("`Transcribing YT audio...`")

            results = load_whisper_api(st.session_state['audio'])

        else:
            st.warning('File size larger than 24mb, applying chunking and transcription', icon="⚠️")

            song = AudioSegment.from_file(st.session_state['audio'], format='mp3')

            # pydub slicing with a step yields consecutive 20-minute segments
            twenty_minutes = 20 * 60 * 1000

            chunks = song[::twenty_minutes]

            transcriptions = []

            video_id = extract.video_id(link)

            print(video_id)

            for i, chunk in enumerate(chunks):
                chunk.export(f'output/chunk_{i}_{video_id}.mp3', format='mp3')
                transcriptions.append(load_whisper_api(f'output/chunk_{i}_{video_id}.mp3'))

            results = ','.join(transcriptions)

            print(results)

    else:
        audio_file, title = get_yt_audio_dl(link)

        print(f'audio_file:{audio_file}')

        st.session_state['audio'] = audio_file

        print(f"audio_file_session_state:{st.session_state['audio']}")

        audio_size = round(os.path.getsize(st.session_state['audio'])/(1024*1024), 1)

        if audio_size <= 25:
            st.info("`Transcribing YT audio...`")

            results = load_whisper_api(st.session_state['audio'])

        else:
            st.warning('File size larger than 24mb, applying chunking and transcription', icon="⚠️")

            song = AudioSegment.from_file(st.session_state['audio'], format='mp3')

            twenty_minutes = 20 * 60 * 1000

            chunks = song[::twenty_minutes]

            transcriptions = []

            video_id = extract.video_id(link)

            for i, chunk in enumerate(chunks):
                chunk.export(f'output/chunk_{i}_{video_id}.mp3', format='mp3')
                transcriptions.append(load_whisper_api(f'output/chunk_{i}_{video_id}.mp3'))

            results = ','.join(transcriptions)

    st.info("`YT Video transcription process complete...`")

    return results, title
					
						
@st.cache_data
def inference(link, _upload):
    '''Convert YouTube video or audio upload to text.
    The leading underscore on _upload tells st.cache_data not to hash the upload argument.'''

    try:
        if validators.url(link):
            st.info("`Downloading YT audio...`")

            results, title = transcribe_yt_video(link)

            return results, title

        elif _upload:
            audio_size = round(os.path.getsize(_upload)/(1024*1024), 1)

            if audio_size <= 25:
                st.info("`Transcribing uploaded audio...`")

                # load_whisper_api returns the transcript as a plain string (response_format="text")
                results = load_whisper_api(_upload)

            else:
                st.write('File size larger than 24mb, applying chunking and transcription')

                song = AudioSegment.from_file(_upload)

                twenty_minutes = 20 * 60 * 1000

                chunks = song[::twenty_minutes]

                transcriptions = []

                st.info("`Transcribing uploaded audio...`")

                for i, chunk in enumerate(chunks):
                    chunk.export(f'output/chunk_{i}.mp4', format='mp4')
                    transcriptions.append(load_whisper_api(f'output/chunk_{i}.mp4'))

                results = ','.join(transcriptions)

            st.info("`Uploaded audio transcription process complete...`")

            return results, "Transcribed Earnings Audio"

    except Exception as e:
        st.error(f'''PyTube Error: {e},
                    Using yt_dlp module, might take longer than expected''', icon="🚨")

        results, title = transcribe_yt_video(link, py_tube=False)

        return results, title
					
						
@st.cache_resource
def send_feedback(run_id, score):
    client.create_feedback(run_id, "user_score", score=score)
					
						
@st.cache_data
def clean_text(text):
    '''Clean all text after inference'''

    text = text.encode("ascii", "ignore").decode()
    text = re.sub(r"https*\S+", " ", text)
    text = re.sub(r"@\S+", " ", text)
    text = re.sub(r"#\S+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text
					
						
@st.cache_data
def chunk_long_text(text, threshold, window_size=3, stride=2):
    '''Preprocess text and chunk for sentiment analysis'''

    sentences = sent_tokenize(text)
    out = []

    for chunk in sentences:
        if len(chunk.split()) < threshold:
            out.append(chunk)
        else:
            words = chunk.split()
            num = int(len(words)/threshold)
            for i in range(0, num*threshold+1, threshold):
                out.append(' '.join(words[i:threshold+i]))

    passages = []

    for paragraph in [out]:
        for start_idx in range(0, len(paragraph), stride):
            end_idx = min(start_idx+window_size, len(paragraph))
            passages.append(" ".join(paragraph[start_idx:end_idx]))

    return passages
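# Illustrative example of the sliding window above: with window_size=3 and
# stride=2, sentences [s0, s1, s2, s3, s4] become the passages
# "s0 s1 s2", "s2 s3 s4", "s4" -- consecutive windows overlap by one sentence,
# so no context is lost at passage boundaries.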
					
						
@st.cache_data
def sentiment_pipe(earnings_text):
    '''Determine the sentiment of the text'''

    earnings_sentences = chunk_long_text(earnings_text, 150, 1, 1)
    earnings_sentiment = sent_pipe(earnings_sentences)

    return earnings_sentiment, earnings_sentences
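# Note: chunk_long_text is called with threshold=150, window_size=1 and stride=1,
# i.e. each ~150-word chunk is scored on its own with no windowing, keeping every
# passage within the classifier's input limit.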
					
						
@st.cache_data
def chunk_and_preprocess_text(text, model_name='philschmid/flan-t5-base-samsum'):
    '''Chunk and preprocess text for summarization'''

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentences = sent_tokenize(text)

    length = 0
    chunk = ""
    chunks = []
    count = -1

    for sentence in sentences:
        count += 1
        combined_length = len(tokenizer.tokenize(sentence)) + length

        if combined_length <= tokenizer.max_len_single_sentence:
            chunk += sentence + " "
            length = combined_length

            if count == len(sentences) - 1:
                chunks.append(chunk)

        else:
            chunks.append(chunk)

            length = 0
            chunk = ""

            chunk += sentence + " "
            length = len(tokenizer.tokenize(sentence))

            # don't drop the final sentence when it starts a fresh chunk
            if count == len(sentences) - 1:
                chunks.append(chunk)

    return chunks
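# The loop above packs sentences greedily: a sentence joins the current chunk
# while the running token count stays within tokenizer.max_len_single_sentence
# (the model's maximum input length minus its special tokens -- roughly 512
# tokens for flan-t5-base), otherwise it starts a new chunk.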
					
						
@st.cache_data
def summarize_text(text_to_summarize, max_len, min_len):
    '''Summarize text with HF model'''

    summarized_text = sum_pipe(text_to_summarize,
                               max_length=max_len,
                               min_length=min_len,
                               do_sample=False,
                               early_stopping=True,
                               num_beams=4)
    summarized_text = ' '.join([summ['summary_text'] for summ in summarized_text])

    return summarized_text
					
						
@st.cache_data
def get_all_entities_per_sentence(text):
    doc = nlp(''.join(text))

    sentences = list(doc.sents)

    entities_all_sentences = []
    for sentence in sentences:
        entities_this_sentence = []

        # spaCy entities
        for entity in sentence.ents:
            entities_this_sentence.append(str(entity))

        # XLM-RoBERTa entities
        entities_xlm = [entity["word"] for entity in ner_pipe(str(sentence))]
        for entity in entities_xlm:
            entities_this_sentence.append(str(entity))

        entities_all_sentences.append(entities_this_sentence)

    return entities_all_sentences

@st.cache_data
def get_all_entities(text):
    all_entities_per_sentence = get_all_entities_per_sentence(text)
    return list(itertools.chain.from_iterable(all_entities_per_sentence))
					
						
@st.cache_data
def get_and_compare_entities(article_content, summary_output):

    all_entities_per_sentence = get_all_entities_per_sentence(article_content)
    entities_article = list(itertools.chain.from_iterable(all_entities_per_sentence))

    all_entities_per_sentence = get_all_entities_per_sentence(summary_output)
    entities_summary = list(itertools.chain.from_iterable(all_entities_per_sentence))

    matched_entities = []
    unmatched_entities = []
    for entity in entities_summary:
        if any(entity.lower() in substring_entity.lower() for substring_entity in entities_article):
            matched_entities.append(entity)
        elif any(
                np.inner(sbert.encode(entity, show_progress_bar=False),
                         sbert.encode(art_entity, show_progress_bar=False)) > 0.9 for
                art_entity in entities_article):
            matched_entities.append(entity)
        else:
            unmatched_entities.append(entity)

    matched_entities = list(dict.fromkeys(matched_entities))
    unmatched_entities = list(dict.fromkeys(unmatched_entities))

    matched_entities_to_remove = []
    unmatched_entities_to_remove = []

    for entity in matched_entities:
        for substring_entity in matched_entities:
            if entity != substring_entity and entity.lower() in substring_entity.lower():
                matched_entities_to_remove.append(entity)

    for entity in unmatched_entities:
        for substring_entity in unmatched_entities:
            if entity != substring_entity and entity.lower() in substring_entity.lower():
                unmatched_entities_to_remove.append(entity)

    matched_entities_to_remove = list(dict.fromkeys(matched_entities_to_remove))
    unmatched_entities_to_remove = list(dict.fromkeys(unmatched_entities_to_remove))

    for entity in matched_entities_to_remove:
        matched_entities.remove(entity)
    for entity in unmatched_entities_to_remove:
        unmatched_entities.remove(entity)

    return matched_entities, unmatched_entities
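# Matching is two-stage: a summary entity counts as matched if it is a
# case-insensitive substring of any source entity, or, failing that, if its
# SBERT embedding has an inner product above 0.9 with any source entity
# (catching close paraphrases). The clean-up loops then drop entities that are
# substrings of longer kept entities so each concept is reported once.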
					
						
@st.cache_data
def highlight_entities(article_content, summary_output):

    markdown_start_red = "<mark class=\"entity\" style=\"background: rgb(238, 135, 135);\">"
    markdown_start_green = "<mark class=\"entity\" style=\"background: rgb(121, 236, 121);\">"
    markdown_end = "</mark>"

    matched_entities, unmatched_entities = get_and_compare_entities(article_content, summary_output)

    # re.escape guards against entities containing regex metacharacters (e.g. "$", ".")
    for entity in matched_entities:
        summary_output = re.sub(rf'({re.escape(entity)})(?![^rgb\(]*\))', markdown_start_green + entity + markdown_end, summary_output)

    for entity in unmatched_entities:
        summary_output = re.sub(rf'({re.escape(entity)})(?![^rgb\(]*\))', markdown_start_red + entity + markdown_end, summary_output)

    soup = BeautifulSoup(summary_output, features="html.parser")

    return HTML_WRAPPER.format(soup)
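# The negative lookahead (?![^rgb\(]*\)) in the patterns above is intended to
# skip occurrences that already sit inside a previously inserted
# <mark style="background: rgb(...)"> span, so entities are not double-wrapped.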
					
						
def summary_downloader(raw_text):
    '''Download the summary generated'''

    b64 = base64.b64encode(raw_text.encode()).decode()
    new_filename = "new_text_file_{}_.txt".format(time_str)
    st.markdown("#### Download Summary as a File ###")
    href = f'<a href="data:file/txt;base64,{b64}" download="{new_filename}">Click to Download!!</a>'
    st.markdown(href, unsafe_allow_html=True)
					
						
@st.cache_data
def generate_eval(raw_text, N, chunk):
    '''Generate N sample question-answer pairs from random chunks of the raw text'''

    update = st.empty()
    ques_update = st.empty()
    update.info("`Generating sample questions ...`")
    n = len(raw_text)
    starting_indices = [random.randint(0, n-chunk) for _ in range(N)]
    sub_sequences = [raw_text[i:i+chunk] for i in starting_indices]
    chain = QAGenerationChain.from_llm(ChatOpenAI(temperature=0))
    eval_set = []

    for i, b in enumerate(sub_sequences):
        try:
            qa = chain.run(b)
            eval_set.append(qa)
            ques_update.info(f"Creating Question: {i+1}")

        except Exception as e:
            print(e)
            st.warning(f'Error in generating Question: {i+1}...', icon="⚠️")
            continue

    eval_set_full = list(itertools.chain.from_iterable(eval_set))

    update.empty()
    ques_update.empty()

    return eval_set_full
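# Each successful chain.run(b) call asks the LLM to write question/answer pairs
# for the sampled chunk and returns them as a list of {"question": ..., "answer": ...}
# dicts, which chain.from_iterable flattens into a single evaluation set.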
					
						
@st.cache_resource
def create_prompt_and_llm():
    '''Create prompt'''

    llm = ChatOpenAI(temperature=0, streaming=True, model="gpt-4o")

    message = SystemMessage(
        content=(
            "You are a helpful chatbot tasked with answering questions accurately about the earnings call transcript provided. "
            "Unless otherwise explicitly stated, assume that questions are about the earnings call transcript. "
            "If there is any ambiguity, assume the question is about the transcript. "
            "Do not use any information not provided in the earnings context, and remember to speak like a finance expert. "
            "If you don't know the answer, just say 'There is no relevant answer in the given earnings call transcript'; "
            "don't try to make up an answer."
        )
    )

    prompt = OpenAIFunctionsAgent.create_prompt(
        system_message=message,
        extra_prompt_messages=[MessagesPlaceholder(variable_name="history")],
    )

    return prompt, llm
					
						
@st.cache_resource
def gen_embeddings(embedding_model):
    '''Generate embeddings for given model'''

    if 'hkunlp' in embedding_model:
        embeddings = HuggingFaceInstructEmbeddings(model_name=embedding_model,
                                                   query_instruction='Represent the Financial question for retrieving supporting paragraphs: ',
                                                   embed_instruction='Represent the Financial paragraph for retrieval: ')

    elif 'mpnet' in embedding_model:
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    elif 'FlagEmbedding' in embedding_model:
        encode_kwargs = {'normalize_embeddings': True}
        embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model,
                                              encode_kwargs=encode_kwargs)

    else:
        # fallback so any other model name still returns an embeddings object
        embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

    return embeddings
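# A rough guide to the branches above: hkunlp/instructor-* models accept task
# instructions at query/embed time, mpnet-style sentence-transformers need no
# extra arguments, and BAAI bge (FlagEmbedding) models are typically used with
# L2-normalized vectors, hence normalize_embeddings=True.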
					
						
@st.cache_data
def create_vectorstore(corpus, title, embedding_model, chunk_size=1000, overlap=50):
    '''Process text for Semantic Search'''

    # `title` is unused in the body but keeps the st.cache_data key unique per transcript

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)

    texts = text_splitter.split_text(corpus)

    embeddings = gen_embeddings(embedding_model)

    vectorstore = FAISS.from_texts(texts, embeddings, metadatas=[{"source": i} for i in range(len(texts))])

    return vectorstore
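# Minimal usage sketch (the `transcript` and `video_title` names are
# illustrative -- in the app they come from inference() above):
#
#   docsearch = create_vectorstore(transcript, video_title,
#                                  'sentence-transformers/all-mpnet-base-v2')
#   memory, agent_executor = create_memory_and_agent(docsearch)
#
# The numeric "source" metadata lets retrieved answers be traced back to the
# originating chunk index.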
					
						
@st.cache_resource
def create_memory_and_agent(_docsearch):
    '''Embed text and generate semantic search scores'''

    retriever = _docsearch.as_retriever(search_kwargs={"k": 4})

    tool = create_retriever_tool(
        retriever,
        "earnings_call_search",
        "Searches and returns documents using the earnings context provided as a source, relevant to the user input question.",
    )

    tools = [tool]

    prompt, llm = create_prompt_and_llm()

    agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt)

    agent_executor = AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=True,
        return_intermediate_steps=True,
    )

    memory = AgentTokenBufferMemory(llm=llm)

    return memory, agent_executor
					
						
@st.cache_data
def gen_sentiment(text):
    '''Generate sentiment of given text'''
    return sent_pipe(text)[0]['label']
					
						
@st.cache_data
def gen_annotated_text(df):
    '''Generate annotated text'''

    tag_list = []
    for row in df.itertuples():
        label = row[2]
        text = row[1]
        if label == 'Positive':
            tag_list.append((text, label, '#8fce00'))
        elif label == 'Negative':
            tag_list.append((text, label, '#f44336'))
        else:
            tag_list.append((text, label, '#000000'))

    return tag_list
					
						
def display_df_as_table(model, top_k, score='score'):
    '''Display the df with text and scores as a table'''

    df = pd.DataFrame([(hit[score], passages[hit['corpus_id']]) for hit in model[0:top_k]], columns=['Score', 'Text'])
    df['Score'] = round(df['Score'], 2)

    return df
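# Note: `passages` is not defined in this module -- this function assumes the
# calling Streamlit page exposes the chunked passages list (e.g. the output of
# chunk_long_text) as a global before rendering search results.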
					
						
def make_spans(text, results):
    results_list = []
    for i in range(len(results)):
        results_list.append(results[i]['label'])
    facts_spans = list(zip(sent_tokenize(text), results_list))
    return facts_spans
					
						
def fin_ext(text):
    # remote_clx is assumed to be a remotely hosted classification pipeline
    # defined by the page that imports this module
    results = remote_clx(sent_tokenize(text))
    return make_spans(text, results)
					
						
def get_article(url):
    article = Article(url)
    article.download()
    article.parse()
    return article
					
						