Spaces:
Sleeping
Sleeping
| # !pip install mistune | |
| import mistune | |
| from mistune.plugins.table import table | |
| from jinja2 import Template | |
| import re | |
| import os | |
def md_to_html(md_text):
    """Render Markdown text as a single-line HTML string.

    The text is parsed by a mistune ``Markdown`` instance built on an
    ``HTMLRenderer`` with the ``table`` plugin enabled. Every newline
    character is then removed from the rendered output.
    """
    to_html = mistune.Markdown(mistune.HTMLRenderer(), plugins=[table])
    rendered = to_html(md_text)
    return rendered.replace('\n', '')
| ####------------------------------ OPTIONAL--> User id and persistent data storage-------------------------------------#### | |
| from datetime import datetime | |
| import psycopg2 | |
| from dotenv import load_dotenv, find_dotenv | |
# Load environment variables from the "keys.env" file.
load_dotenv("keys.env")
# API keys are read with os.getenv, so a variable that is not set yields None
# here instead of raising.
TOGETHER_API_KEY = os.getenv('TOGETHER_API_KEY')
BRAVE_API_KEY = os.getenv('BRAVE_API_KEY')
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HELICON_API_KEY = os.getenv("HELICON_API_KEY")
# Database credentials are read with os.environ[...], which raises KeyError
# while this module is being imported if either variable is missing.
SUPABASE_USER = os.environ['SUPABASE_USER']
SUPABASE_PASSWORD = os.environ['SUPABASE_PASSWORD']
def insert_data(user_id, user_query, subtopic_query, response, html_report):
    """Insert one record into the ``research_pro_chat_v2`` table.

    Opens a new PostgreSQL connection for the call, inserts a single row
    using a parameterized query, commits, and closes the connection.

    Args:
        user_id: Value stored in the ``user_id`` column.
        user_query: Value stored in the ``user_query`` column.
        subtopic_query: Value stored in the ``subtopic_query`` column.
        response: Value stored in the ``response`` column.
        html_report: Value stored in the ``html_report`` column.

    Raises:
        psycopg2.Error: If connecting or executing the insert fails. The
            transaction is rolled back and the connection is still closed.
    """
    conn = psycopg2.connect(
        dbname="postgres",
        user=SUPABASE_USER,
        password=SUPABASE_PASSWORD,
        host="aws-0-us-west-1.pooler.supabase.com",
        port="5432",
    )
    insert_query = """
    INSERT INTO research_pro_chat_v2 (user_id, user_query, subtopic_query, response, html_report, created_at)
    VALUES (%s, %s, %s, %s, %s, %s);
    """
    try:
        # `with conn` commits on success and rolls back on an exception;
        # `with conn.cursor()` closes the cursor. Neither closes the
        # connection itself, hence the explicit close in `finally`.
        with conn, conn.cursor() as cur:
            cur.execute(
                insert_query,
                (user_id, user_query, subtopic_query, response, html_report, datetime.now()),
            )
    finally:
        conn.close()
| ####-----------------------------------------------------END----------------------------------------------------------#### | |
| import ast | |
| from fpdf import FPDF | |
| import re | |
| import pandas as pd | |
| import nltk | |
| import requests | |
| import json | |
| from retry import retry | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| from bs4 import BeautifulSoup | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from brave import Brave | |
| from fuzzy_json import loads | |
| from half_json.core import JSONFixer | |
| from openai import OpenAI | |
| from together import Together | |
# Initial model names. NOTE: both names are assigned again further down in
# this file (under "Groq model names"), so these two values are superseded
# before any function default that references them is evaluated.
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
# System prompt used for retrieval/summarization requests.
SysPromptData = "You are an information retriever and summarizer, return only the factual information regarding the user query"
# System prompt used when the caller does not supply one.
SysPromptDefault = "You are an expert AI, complete the given task. Do not add any additional comments."
| import tiktoken # Used to limit tokens | |
# Tokenizer used by limit_tokens. The gpt-3.5-turbo encoding is used as an
# available stand-in rather than a Llama 3 tokenizer; replace it if a better
# match is found.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
def limit_tokens(input_string, token_limit=7500):
    """Truncate text to at most ``token_limit`` tokens.

    The string is encoded with the module-level ``encoding``, the token
    list is cut to its first ``token_limit`` entries, and the kept tokens
    are decoded back into a string.
    """
    token_ids = encoding.encode(input_string)
    kept_ids = token_ids[:token_limit]
    return encoding.decode(kept_ids)
# OpenAI-compatible client pointed at the Together endpoint on hconeai.com.
# Every request carries a "Helicone-Auth" header built from HELICON_API_KEY.
together_client = OpenAI(
    api_key=TOGETHER_API_KEY,
    base_url="https://together.hconeai.com/v1",
    default_headers={ "Helicone-Auth": f"Bearer {HELICON_API_KEY}"})
# OpenAI-compatible client pointed at the Groq endpoint on hconeai.com, with
# the same "Helicone-Auth" header.
groq_client = OpenAI(
    api_key=GROQ_API_KEY,
    base_url="https://groq.hconeai.com/openai/v1",
    default_headers={ "Helicone-Auth": f"Bearer {HELICON_API_KEY}"})
# Groq model names. These reassign llm_default_small / llm_default_medium,
# replacing the values set earlier in this file.
llm_default_small = "llama3-8b-8192"
llm_default_medium = "llama3-70b-8192"
# Together model names (fallback), used by together_response when the Groq
# call raises.
llm_fallback_small = "meta-llama/Llama-3-8b-chat-hf"
llm_fallback_medium = "meta-llama/Llama-3-70b-chat-hf"
| ### ------END OF LLM CONFIG-------- ### | |
def together_response(message, model=llm_default_small, SysPrompt=SysPromptDefault,
                      temperature=0.2, frequency_penalty=0.1, max_tokens=2000):
    """Return the chat-completion text for ``message``.

    The request (system prompt plus user message) is first sent through
    ``groq_client``. If that call raises any exception, the error is printed
    and the same request is sent through ``together_client`` using
    ``llm_fallback_small`` when ``model`` equals ``llm_default_small`` and
    ``llm_fallback_medium`` otherwise.

    Returns:
        The ``content`` of the first choice's message.
    """
    request = dict(
        model=model,
        messages=[
            {"role": "system", "content": SysPrompt},
            {"role": "user", "content": message},
        ],
        temperature=temperature,
        frequency_penalty=frequency_penalty,
        max_tokens=max_tokens,
    )
    try:
        return groq_client.chat.completions.create(**request).choices[0].message.content
    except Exception as e:
        print(f"Error calling GROQ API: {e}")
        # Retry on Together with the fallback model of the matching size.
        if model == llm_default_small:
            request["model"] = llm_fallback_small
        else:
            request["model"] = llm_fallback_medium
        completion = together_client.chat.completions.create(**request)
        return completion.choices[0].message.content
def json_from_text(text):
    """Parse JSON out of ``text``, tolerating surrounding noise.

    Strict ``json.loads`` is tried first. If it fails, the span from the
    first ``{`` to the last ``}`` is extracted (or the whole text when no
    such span exists) and handed to the fuzzy JSON loader ``loads``.

    Args:
        text: String expected to contain JSON.

    Returns:
        The decoded Python object.
    """
    try:
        return json.loads(text)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError. A bare `except:` would
        # also swallow KeyboardInterrupt and SystemExit, so only real parse
        # failures fall through to the fuzzy path.
        pass
    match = re.search(r'\{[\s\S]*\}', text)
    json_out = match.group(0) if match else text
    # Use fuzzy JSON loading for malformed-but-recoverable input.
    return loads(json_out)
def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with ``word_tokenize``. Tokens whose lowercase
    form is in NLTK's English stop-word list are dropped, and the remaining
    tokens are joined with single spaces.
    """
    english_stopwords = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return ' '.join(
        token for token in tokens if token.lower() not in english_stopwords
    )
def rephrase_content(data_format, content, query):
    """Ask the LLM to restate scraped ``content`` as it relates to ``query``.

    The prompt depends on ``data_format``:

    * ``"Structured data"``: concise paragraphs and/or tables, with the
      context truncated by ``limit_tokens`` at its default limit.
    * ``"Quantitative data"``: numerical data as Markdown tables, with the
      context truncated to 1000 tokens.
    * anything else: plain factual information, with the context truncated
      to 1000 tokens.

    Args:
        data_format: Selects which prompt is built.
        content: Scraped text used as context.
        query: The user query the answer must relate to.

    Returns:
        The text returned by ``together_response``.
    """
    # Prompts are built from adjacent string literals rather than a
    # backslash continuation, so no source indentation leaks into the prompt.
    if data_format == "Structured data":
        prompt = (
            f"return only the factual information regarding the query: {{{query}}}. "
            "Output should be concise chunks of paragraphs or tables or both, "
            f"using the scraped context:{{{limit_tokens(content)}}}"
        )
    elif data_format == "Quantitative data":
        prompt = (
            f"return only the numerical or quantitative data regarding the query: {{{query}}} "
            "structured into .md tables, "
            f"using the scraped context:{{{limit_tokens(content, token_limit=1000)}}}"
        )
    else:
        prompt = (
            f"return only the factual information regarding the query: {{{query}}} "
            f"using the scraped context:{{{limit_tokens(content, token_limit=1000)}}}"
        )
    return together_response(prompt, SysPrompt=SysPromptData, max_tokens=500)
class Scraper:
    """Small HTTP fetcher built on a ``requests.Session``.

    Args:
        user_agent: Value sent as the ``User-Agent`` header on every request.
        timeout: Timeout passed to each GET request. Defaults to 2, the
            value that was previously hard-coded.
    """

    def __init__(self, user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", timeout=2):
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": user_agent})
        self.timeout = timeout

    def fetch_content(self, url):
        """Return the response body for ``url``, or ``None`` on failure.

        ``None`` is returned both when the status code is not 200 and when
        ``requests`` raises a ``RequestException`` (which is printed).
        """
        try:
            response = self.session.get(url, timeout=self.timeout)
            if response.status_code == 200:
                return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching page content for {url}: {e}")
        return None
def extract_main_content(html):
    """Extract heading, paragraph and table text from an HTML document.

    Returns an empty string for falsy ``html``. Otherwise the document is
    parsed with BeautifulSoup (``lxml`` parser) and the text of every
    ``h1``-``h6``, ``p`` and ``table`` element is emitted in the order
    ``find_all`` yields them, each followed by a newline.
    """
    if not html:
        return ""
    soup = BeautifulSoup(html, 'lxml')
    wanted_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']
    return "".join(
        node.get_text(separator=" ", strip=True) + "\n"
        for node in soup.find_all(wanted_tags)
    )
def process_content(data_format, url, query):
    """Fetch ``url``, extract its main text and rephrase it for ``query``.

    Args:
        data_format: Passed through to ``rephrase_content``.
        url: Page to fetch.
        query: The user query the rephrased text must relate to.

    Returns:
        A ``(text, url)`` tuple. ``text`` is an empty string when the page
        could not be fetched or no content was extracted from it.
    """
    scraper = Scraper()
    try:
        html_content = scraper.fetch_content(url)
    finally:
        # One Scraper (and one requests.Session) is created per call, so
        # release its pooled connections instead of leaving them to the GC.
        scraper.session.close()

    if not html_content:
        return "", url
    content = extract_main_content(html_content)
    if not content:
        return "", url

    rephrased_content = rephrase_content(
        data_format=data_format,
        content=limit_tokens(remove_stopwords(content), token_limit=1000),
        query=query,
    )
    return rephrased_content, url
def fetch_and_extract_content(data_format, urls, query):
    """Run ``process_content`` for every URL concurrently.

    One worker thread is used per URL.

    Args:
        data_format: Passed through to ``process_content``.
        urls: Iterable of URLs to process.
        query: Passed through to ``process_content``.

    Returns:
        A list of ``process_content`` results in completion order (not
        input order). An empty list is returned for empty ``urls``.
    """
    urls = list(urls)
    if not urls:
        # ThreadPoolExecutor raises ValueError when max_workers is 0.
        return []
    with ThreadPoolExecutor(max_workers=len(urls)) as executor:
        future_to_url = {
            executor.submit(process_content, data_format, url, query): url
            for url in urls
        }
        all_text_with_urls = [future.result() for future in as_completed(future_to_url)]
    return all_text_with_urls
def search_brave(query, num_results=5):
    """Search Brave for ``query`` and return the result URLs as strings.

    A ``Brave`` client is created with ``BRAVE_API_KEY`` and asked for
    ``num_results`` results. Each entry of the result's ``urls`` attribute
    is converted to ``str``.
    """
    client = Brave(BRAVE_API_KEY)
    results = client.search(q=query, count=num_results)
    return [str(url) for url in results.urls]