jesusgj committed · Commit 07e3a65 · 1 Parent(s): 9fb8366
Modified files:
- agent.py +106 -45
- requirements.txt +2 -1
agent.py
CHANGED
@@ -2,15 +2,20 @@ import os
 import re
 import requests
 import serpapi
+import time
 from smolagents import CodeAgent, ToolCallingAgent, WebSearchTool, tool
 from smolagents import InferenceClientModel
 from dotenv import load_dotenv
 from markdownify import markdownify
 from requests.exceptions import RequestException
 from llama_index.core import VectorStoreIndex, download_loader
+from llama_index.core.schema import Document
+from youtube_transcript_api import YouTubeTranscriptApi
 
 search_cache = {}
 webpage_cache = {}
+MAX_RETRIES = 3
+INITIAL_DELAY = 1  # seconds
 
 def initialize_agent():
     # Load environment variables from .env file
@@ -18,7 +23,7 @@ def initialize_agent():
 
     # 1. Load the model
     # Make sure to set HF_TOKEN in your Hugging Face Space secrets
-    model_name = "mistralai/Mixtral-
+    model_name = "mistralai/Mixtral-8x22B-Instruct-v0.1"
     try:
         model = InferenceClientModel(model_id=model_name, token=os.environ.get("HF_TOKEN"), provider="together")
     except Exception as e:
@@ -39,18 +44,59 @@ def initialize_agent():
         """
         if (url, query) in webpage_cache:
             return webpage_cache[(url, query)]
-        … (previous body of query_webpage; not captured in this view)
+
+        for i in range(MAX_RETRIES):
+            try:
+                BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
+                loader = BeautifulSoupWebReader()
+                documents = loader.load_data(urls=[url])
+                index = VectorStoreIndex.from_documents(documents)
+                query_engine = index.as_query_engine()
+                response = query_engine.query(query)
+                webpage_cache[(url, query)] = str(response)
+                return str(response)
+
+            except Exception as e:
+                if i < MAX_RETRIES - 1:
+                    delay = INITIAL_DELAY * (2 ** i)
+                    print(f"Error querying webpage: {str(e)}. Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                else:
+                    return f"An unexpected error occurred after multiple retries: {str(e)}"
+
+    @tool
+    def query_youtube_video(video_id: str, query: str) -> str:
+        """Queries a YouTube video's transcript to find specific information and returns a concise answer.
+
+        Args:
+            video_id: The ID of the YouTube video.
+            query: The specific question to ask about the content of the video transcript.
+
+        Returns:
+            A concise answer to the query based on the video transcript, or an error message.
+        """
+        if (video_id, query) in webpage_cache:  # Using webpage_cache for simplicity
+            return webpage_cache[(video_id, query)]
+
+        for i in range(MAX_RETRIES):
+            try:
+                transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+                transcript_text = " ".join([t['text'] for t in transcript_list])
+
+                documents = [Document(text=transcript_text)]
+                index = VectorStoreIndex.from_documents(documents)
+                query_engine = index.as_query_engine()
+                response = query_engine.query(query)
+                webpage_cache[(video_id, query)] = str(response)
+                return str(response)
+
+            except Exception as e:
+                if i < MAX_RETRIES - 1:
+                    delay = INITIAL_DELAY * (2 ** i)
+                    print(f"Error querying YouTube video: {str(e)}. Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                else:
+                    return f"An unexpected error occurred after multiple retries: {str(e)}"
 
     @tool
     def google_search(query: str) -> str:
@@ -64,40 +110,47 @@ def initialize_agent():
         """
         if query in search_cache:
             return search_cache[query]
-        … (previous body of google_search; not captured in this view beyond the fragment "output +=")
+
+        for i in range(MAX_RETRIES):
+            try:
+                client = serpapi.Client(api_key=os.environ.get("SERPAPI_API_KEY"))
+                results = client.search(q=query, engine="google")
+                if "ai_overview" in results:
+                    ai_overview = results["ai_overview"]
+                    output = ""
+                    for block in ai_overview.get("text_blocks", []):
+                        if block["type"] == "paragraph":
+                            output += block["snippet"] + "\n\n"
+                        elif block["type"] == "heading":
+                            output += f"### {block['snippet']}\n\n"
+                        elif block["type"] == "list":
+                            for item in block["list"]:
+                                output += f"- **{item['title']}** {item['snippet']}\n"
+                            output += "\n"
+                    if "references" in ai_overview:
+                        output += "\n**References:**\n"
+                        for ref in ai_overview["references"]:
+                            output += f"- [{ref['title']}]({ref['link']})\n"
+                    search_cache[query] = output
+                    return output
+                elif "organic_results" in results:
+                    result = str(results["organic_results"])
+                    search_cache[query] = result
+                    return result
+                else:
+                    return "No results found."
+            except Exception as e:
+                if i < MAX_RETRIES - 1:
+                    delay = INITIAL_DELAY * (2 ** i)
+                    print(f"Error performing Google search: {str(e)}. Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                else:
+                    return f"Error performing Google search after multiple retries: {str(e)}"
 
     # 3. Define the agents
     if model:
         web_agent = ToolCallingAgent(
-            tools=[WebSearchTool(), query_webpage, google_search],
+            tools=[WebSearchTool(), query_webpage, query_youtube_video, google_search],
             model=model,
             max_steps=10,
             name="web_search_agent",
@@ -108,8 +161,16 @@ def initialize_agent():
             tools=[],
             model=model,
             managed_agents=[web_agent],
-            additional_authorized_imports=["time", "numpy", "pandas", "requests", "serpapi", "llama_index", "beautifulsoup4", "markdownify", "lxml", "json", "urllib.parse"],
-            instructions='''You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the a new line and the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.'''
+            additional_authorized_imports=["time", "numpy", "pandas", "requests", "serpapi", "llama_index", "beautifulsoup4", "markdownify", "lxml", "json", "urllib.parse", "youtube_transcript_api"],
+            instructions='''You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the a new line and the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+
+            To achieve the best results, follow these steps:
+            1. **Understand the Question:** Carefully read and analyze the user's question to identify the core task and any specific constraints (e.g., format, type of answer).
+            2. **Formulate a Plan:** Based on the question, devise a step-by-step plan. This might involve using web search, querying webpages, or analyzing YouTube videos. Consider what information is needed and which tool is best suited to obtain it.
+            3. **Execute Tools:** Use the available tools (WebSearchTool, query_webpage, query_youtube_video, google_search) to gather the necessary information. Be mindful of rate limits and use caching effectively.
+            4. **Synthesize Information:** Combine and process the information obtained from the tools to formulate a comprehensive answer. If the question requires specific data extraction, ensure accuracy.
+            5. **Format the Final Answer:** Adhere strictly to the specified FINAL ANSWER template. Ensure the answer type (number, string, comma-separated list) matches the question's requirement.
+            6. **Self-Correction:** If initial attempts fail or produce unsatisfactory results, re-evaluate the plan and try alternative approaches or tools.'''
         )
         return manager_agent
     else:
requirements.txt
CHANGED
@@ -7,4 +7,5 @@ markdownify
 duckduckgo-search
 wikipedia
 serpapi
-llama-index
+llama-index
+youtube-transcript-api
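As a quick check after installing the updated requirements, the new youtube-transcript-api dependency can be exercised on its own with the same get_transcript call agent.py relies on; the video ID below is only an example.

```python
# Hypothetical smoke test for the youtube-transcript-api dependency (not part of this commit).
from youtube_transcript_api import YouTubeTranscriptApi

# get_transcript returns a list of {"text", "start", "duration"} dicts,
# which agent.py joins into a single string before indexing it.
segments = YouTubeTranscriptApi.get_transcript("dQw4w9WgXcQ")  # example video ID
print(" ".join(seg["text"] for seg in segments)[:200])
```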