Spaces:

pvanand
/

general_chat

Running

App Files Files Community

pvanand committed on May 28, 2024

Commit

1f1d19b

verified ·

1 Parent(s): f48a49c

Update helper_functions_api.py

Browse files

Files changed (1) hide show

helper_functions_api.py +36 -111

helper_functions_api.py CHANGED Viewed

@@ -70,20 +70,19 @@ from together import Together
 llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
 llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
-SysPromptJson = "You are now in the role of an expert AI who can extract structured information from user request. Both key and value pairs must be in double quotes. You must respond ONLY with a valid JSON file. Do not add any additional comments."
-SysPromptList = "You are now in the role of an expert AI who can extract structured information from user request. All elements must be in double quotes. You must respond ONLY with a valid python List. Do not add any additional comments."
 SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
 import tiktoken # Used to limit tokens
 encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # Instead of Llama3 using available option/ replace if found anything better
-def limit_tokens(input_string, token_limit=8000):
     """
     Limit tokens sent to the model
     """
     return encoding.decode(encoding.encode(input_string)[:token_limit])
-def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPrompt = SysPromptDefault, temperature=0.2):
     client = OpenAI(
         api_key=TOGETHER_API_KEY,
         base_url="https://together.hconeai.com/v1",
@@ -95,6 +94,7 @@ def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPromp
         model=model,
         messages=messages,
         temperature=temperature,
         )
     return response.choices[0].message.content
@@ -122,11 +122,27 @@ def remove_stopwords(text):
     filtered_text = [word for word in words if word.lower() not in stop_words]
     return ' '.join(filtered_text)
-def rephrase_content(content, query):
-    return together_response(f"You are an information retriever and summarizer,ignore everything you know, return only the\
-    factual information regarding the query: {{{query}}} into a maximum of {500} words. Output should be concise chunks of \
-    paragraphs or tables or both, ignore links, using the scraped context:{{{content}}}")
 class Scraper:
     def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
         self.session = requests.Session()
@@ -151,23 +167,31 @@ def extract_main_content(html):
         return plain_text
     return ""
-def process_content(url, query):
     scraper = Scraper()
     html_content = scraper.fetch_content(url)
     if html_content:
         content = extract_main_content(html_content)
         if content:
-            rephrased_content = rephrase_content(limit_tokens(remove_stopwords(content)), query)
             return rephrased_content, url
     return "", url
-def fetch_and_extract_content(urls, query):
     with ThreadPoolExecutor(max_workers=len(urls)) as executor:
-        future_to_url = {executor.submit(process_content, url, query): url for url in urls}
         all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
     return all_text_with_urls
 def search_brave(query, num_results=5):
     brave = Brave(BRAVE_API_KEY)
@@ -176,103 +200,4 @@ def search_brave(query, num_results=5):
     return [url.__str__() for url in search_results.urls]
-def generate_report_with_reference(full_data):
-    """
-    Generate HTML report with references and saves pdf report to "generated_pdf_report.pdf"
-    """
-    pdf = FPDF()
-    with open("report_with_references_template.html") as f: # src/research-pro/app_v1.5_online/
-        html_template = f.read()
-    # Loop through each row in your dataset
-    html_report = ''
-    idx = 1
-    for subtopic_data in full_data:
-        md_report = md_to_html(subtopic_data['md_report'])
-        # Convert the string representation of a list of tuples back to a list of tuples
-        references = ast.literal_eval(subtopic_data['text_with_urls'])
-        collapsible_blocks = []
-        for ref_idx, reference in enumerate(references):
-            ref_text = md_to_html(reference[0])
-            ref_url = reference[1]
-            urls_html = ''.join(f'<a href="{ref_url}"> {ref_url}</a>')
-            collapsible_block = '''
-            <details>
-                <summary>Reference {}: {}</summary>
-                <div>
-                    <p>{}</p>
-                    <ul>{}</ul>
-                </div>
-            </details>
-            '''.format(ref_idx+1, urls_html, ref_text, urls_html)
-            collapsible_blocks.append(collapsible_block)
-        references_html = '\n'.join(collapsible_blocks)
-        template = Template(html_template)
-        html_page = template.render(md_report=md_report, references=references_html)
-        pdf.add_page()
-        pdf_report = f"<h1><strong>Report {idx}</strong></h1>"+md_report+f"<h1><strong>References for Report {idx}</strong></h1>"+references_html
-        pdf.write_html(pdf_report.encode('ascii', 'ignore').decode('ascii')) # Filter non-asci characters
-        html_report += html_page
-        idx+=1
-    pdf.output("generated_pdf_report.pdf")
-    return html_report
-def write_dataframes_to_excel(dataframes_list, filename):
-    """
-    input: [df_list1, df_list2, ..]
-    saves filename.xlsx
-    """
-    try:
-        with pd.ExcelWriter(filename, engine="openpyxl") as writer:
-            for idx, dataframes in enumerate(dataframes_list):
-                startrow = 0
-                for idx2, df in enumerate(dataframes):
-                    df.to_excel(writer, sheet_name=f"Sheet{idx+1}", startrow=startrow, index=False)
-                    startrow += len(df) + 2
-    except:
-        # Empty dataframe due to no tables found, file is not written
-        pass
-def extract_tables_from_html(html_file):
-    """
-    input: html_file
-    output: [df1,df2,df3,..]
-    """
-    # Initialize an empty list to store the dataframes
-    dataframes = []
-    # Open the HTML file and parse it with BeautifulSoup
-    soup = BeautifulSoup(html_file, 'html.parser')
-    # Find all the tables in the HTML file
-    tables = soup.find_all('table')
-    # Iterate through each table
-    for table in tables:
-        # Extract the table headers
-        headers = [th.text for th in table.find_all('th')]
-        # Extract the table data
-        rows = table.find_all('tr')
-        data = []
-        for row in rows:
-            row_data = [td.text for td in row.find_all('td')]
-            data.append(row_data)
-        # Create a dataframe from the headers and data
-        df = pd.DataFrame(data, columns=headers)
-        # Append the dataframe to the list of dataframes
-        dataframes.append(df)
-    # Return the list of dataframes
-    return dataframes

 llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
 llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
+SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
 SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
 import tiktoken # Used to limit tokens
 encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") # Instead of Llama3 using available option/ replace if found anything better
def limit_tokens(input_string, token_limit=7500):
    """
    Truncate a string to at most `token_limit` tokens.

    The text is tokenised with the module-level `encoding`, only the first
    `token_limit` token ids are kept, and those ids are decoded back to text.
    """
    token_ids = encoding.encode(input_string)
    kept_ids = token_ids[:token_limit]
    return encoding.decode(kept_ids)
+def together_response(message, model = "meta-llama/Llama-3-8b-chat-hf", SysPrompt = SysPromptDefault, temperature=0.2, frequency_penalty =0.1, max_tokens= 2000):
     client = OpenAI(
         api_key=TOGETHER_API_KEY,
         base_url="https://together.hconeai.com/v1",
         model=model,
         messages=messages,
         temperature=temperature,
+        frequency_penalty = frequency_penalty
         )
     return response.choices[0].message.content
     filtered_text = [word for word in words if word.lower() not in stop_words]
     return ' '.join(filtered_text)
def rephrase_content(data_format, content, query):
    """
    Ask the model to restate scraped content as it relates to `query`.

    The prompt wording and the token cap applied to `content` depend on
    `data_format`:
      - "Structured data": concise paragraphs and/or tables; `content` is
        capped with the default limit of limit_tokens.
      - "Quantitative data": numerical data structured into .md tables;
        `content` is capped at 1000 tokens.
      - any other value: plain factual information; `content` is capped at
        1000 tokens.

    Every variant is sent with SysPromptData as the system prompt and
    max_tokens=500. Returns whatever together_response returns.
    """
    if data_format == "Structured data":
        prompt = (
            f"return only the factual information regarding the query: {{{query}}}. "
            "Output should be concise chunks of "
            # The run of spaces before "paragraphs" is part of the existing
            # prompt text; it is kept so the prompt sent to the model is unchanged.
            "    paragraphs or tables or both, "
            f"using the scraped context:{{{limit_tokens(content)}}}"
        )
    elif data_format == "Quantitative data":
        prompt = (
            f"return only the numerical or quantitative data regarding the query: {{{query}}} "
            "structured into .md tables, "
            f"using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}"
        )
    else:
        prompt = (
            f"return only the factual information regarding the query: {{{query}}} "
            f"using the scraped context:{{{limit_tokens(content,token_limit=1000)}}}"
        )

    # All three variants share the same system prompt and response budget.
    return together_response(prompt, SysPrompt=SysPromptData, max_tokens=500)
 class Scraper:
     def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"):
         self.session = requests.Session()
         return plain_text
     return ""
def process_content(data_format, url, query):
    """
    Fetch one URL and pass its main content through rephrase_content.

    Returns a (text, url) tuple. `text` is the output of rephrase_content, or
    "" when fetch_content or extract_main_content yields a falsy value.
    """
    page_html = Scraper().fetch_content(url)
    if not page_html:
        return "", url

    main_text = extract_main_content(page_html)
    if not main_text:
        return "", url

    # Strip stopwords first, then cap at 1000 tokens, before calling the model.
    trimmed_text = limit_tokens(remove_stopwords(main_text), token_limit=1000)
    rephrased_content = rephrase_content(
        data_format=data_format,
        content=trimmed_text,
        query=query,
    )
    return rephrased_content, url
def fetch_and_extract_content(data_format, urls, query):
    """
    Run process_content for every URL concurrently, one worker thread per URL.

    Args:
        data_format: Forwarded unchanged to process_content for each URL.
        urls: Sized collection of URLs to process.
        query: Forwarded unchanged to process_content for each URL.

    Returns:
        A list holding one process_content result per URL, in the order the
        calls complete (not the order of `urls`). Empty when `urls` is empty.
    """
    # ThreadPoolExecutor rejects max_workers=0 with ValueError, so an empty
    # URL collection must short-circuit before the executor is created.
    if not urls:
        return []

    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        future_to_url = {
            executor.submit(process_content, data_format, url, query): url
            for url in urls
        }
        # as_completed yields each future as soon as it finishes; result()
        # re-raises any exception raised inside the worker.
        all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
    return all_text_with_urls
 def search_brave(query, num_results=5):
     brave = Brave(BRAVE_API_KEY)
     return [url.__str__() for url in search_results.urls]