jesusgj committed
Commit 07e3a65 · 1 Parent(s): 9fb8366

Modified files

Files changed (2)
  1. agent.py +106 -45
  2. requirements.txt +2 -1
agent.py CHANGED
@@ -2,15 +2,20 @@ import os
 import re
 import requests
 import serpapi
+import time
 from smolagents import CodeAgent, ToolCallingAgent, WebSearchTool, tool
 from smolagents import InferenceClientModel
 from dotenv import load_dotenv
 from markdownify import markdownify
 from requests.exceptions import RequestException
 from llama_index.core import VectorStoreIndex, download_loader
+from llama_index.core.schema import Document
+from youtube_transcript_api import YouTubeTranscriptApi
 
 search_cache = {}
 webpage_cache = {}
+MAX_RETRIES = 3
+INITIAL_DELAY = 1  # seconds
 
 def initialize_agent():
     # Load environment variables from .env file
@@ -18,7 +23,7 @@ def initialize_agent():
 
     # 1. Load the model
     # Make sure to set HF_TOKEN in your Hugging Face Space secrets
-    model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+    model_name = "mistralai/Mixtral-8x22B-Instruct-v0.1"
     try:
         model = InferenceClientModel(model_id=model_name, token=os.environ.get("HF_TOKEN"), provider="together")
     except Exception as e:
@@ -39,18 +44,59 @@ def initialize_agent():
         """
         if (url, query) in webpage_cache:
             return webpage_cache[(url, query)]
-        try:
-            BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
-            loader = BeautifulSoupWebReader()
-            documents = loader.load_data(urls=[url])
-            index = VectorStoreIndex.from_documents(documents)
-            query_engine = index.as_query_engine()
-            response = query_engine.query(query)
-            webpage_cache[(url, query)] = str(response)
-            return str(response)
-
-        except Exception as e:
-            return f"An unexpected error occurred: {str(e)}"
+
+        for i in range(MAX_RETRIES):
+            try:
+                BeautifulSoupWebReader = download_loader("BeautifulSoupWebReader")
+                loader = BeautifulSoupWebReader()
+                documents = loader.load_data(urls=[url])
+                index = VectorStoreIndex.from_documents(documents)
+                query_engine = index.as_query_engine()
+                response = query_engine.query(query)
+                webpage_cache[(url, query)] = str(response)
+                return str(response)
+
+            except Exception as e:
+                if i < MAX_RETRIES - 1:
+                    delay = INITIAL_DELAY * (2 ** i)
+                    print(f"Error querying webpage: {str(e)}. Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                else:
+                    return f"An unexpected error occurred after multiple retries: {str(e)}"
+
+    @tool
+    def query_youtube_video(video_id: str, query: str) -> str:
+        """Queries a YouTube video's transcript to find specific information and returns a concise answer.
+
+        Args:
+            video_id: The ID of the YouTube video.
+            query: The specific question to ask about the content of the video transcript.
+
+        Returns:
+            A concise answer to the query based on the video transcript, or an error message.
+        """
+        if (video_id, query) in webpage_cache:  # Using webpage_cache for simplicity
+            return webpage_cache[(video_id, query)]
+
+        for i in range(MAX_RETRIES):
+            try:
+                transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
+                transcript_text = " ".join([t['text'] for t in transcript_list])
+
+                documents = [Document(text=transcript_text)]
+                index = VectorStoreIndex.from_documents(documents)
+                query_engine = index.as_query_engine()
+                response = query_engine.query(query)
+                webpage_cache[(video_id, query)] = str(response)
+                return str(response)
+
+            except Exception as e:
+                if i < MAX_RETRIES - 1:
+                    delay = INITIAL_DELAY * (2 ** i)
+                    print(f"Error querying YouTube video: {str(e)}. Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                else:
+                    return f"An unexpected error occurred after multiple retries: {str(e)}"
 
     @tool
     def google_search(query: str) -> str:
@@ -64,40 +110,47 @@ def initialize_agent():
         """
         if query in search_cache:
            return search_cache[query]
-        try:
-            client = serpapi.Client(api_key=os.environ.get("SERPAPI_API_KEY"))
-            results = client.search(q=query, engine="google")
-            if "ai_overview" in results:
-                ai_overview = results["ai_overview"]
-                output = ""
-                for block in ai_overview.get("text_blocks", []):
-                    if block["type"] == "paragraph":
-                        output += block["snippet"] + "\n\n"
-                    elif block["type"] == "heading":
-                        output += f"### {block['snippet']}\n\n"
-                    elif block["type"] == "list":
-                        for item in block["list"]:
-                            output += f"- **{item['title']}** {item['snippet']}\n"
-                        output += "\n"
-                if "references" in ai_overview:
-                    output += "\n**References:**\n"
-                    for ref in ai_overview["references"]:
-                        output += f"- [{ref['title']}]({ref['link']})\n"
-                search_cache[query] = output
-                return output
-            elif "organic_results" in results:
-                result = str(results["organic_results"])
-                search_cache[query] = result
-                return result
-            else:
-                return "No results found."
-        except Exception as e:
-            return f"Error performing Google search: {str(e)}"
+
+        for i in range(MAX_RETRIES):
+            try:
+                client = serpapi.Client(api_key=os.environ.get("SERPAPI_API_KEY"))
+                results = client.search(q=query, engine="google")
+                if "ai_overview" in results:
+                    ai_overview = results["ai_overview"]
+                    output = ""
+                    for block in ai_overview.get("text_blocks", []):
+                        if block["type"] == "paragraph":
+                            output += block["snippet"] + "\n\n"
+                        elif block["type"] == "heading":
+                            output += f"### {block['snippet']}\n\n"
+                        elif block["type"] == "list":
+                            for item in block["list"]:
+                                output += f"- **{item['title']}** {item['snippet']}\n"
+                            output += "\n"
+                    if "references" in ai_overview:
+                        output += "\n**References:**\n"
+                        for ref in ai_overview["references"]:
+                            output += f"- [{ref['title']}]({ref['link']})\n"
+                    search_cache[query] = output
+                    return output
+                elif "organic_results" in results:
+                    result = str(results["organic_results"])
+                    search_cache[query] = result
+                    return result
+                else:
+                    return "No results found."
+            except Exception as e:
+                if i < MAX_RETRIES - 1:
+                    delay = INITIAL_DELAY * (2 ** i)
+                    print(f"Error performing Google search: {str(e)}. Retrying in {delay} seconds...")
+                    time.sleep(delay)
+                else:
+                    return f"Error performing Google search after multiple retries: {str(e)}"
 
     # 3. Define the agents
     if model:
         web_agent = ToolCallingAgent(
-            tools=[WebSearchTool(), query_webpage, google_search],
+            tools=[WebSearchTool(), query_webpage, query_youtube_video, google_search],
             model=model,
            max_steps=10,
            name="web_search_agent",
@@ -108,8 +161,16 @@ def initialize_agent():
             tools=[],
             model=model,
             managed_agents=[web_agent],
-            additional_authorized_imports=["time", "numpy", "pandas", "requests", "serpapi", "llama_index", "beautifulsoup4", "markdownify", "lxml", "json", "urllib.parse"],
-            instructions='''You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the a new line and the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.'''
+            additional_authorized_imports=["time", "numpy", "pandas", "requests", "serpapi", "llama_index", "beautifulsoup4", "markdownify", "lxml", "json", "urllib.parse", "youtube_transcript_api"],
+            instructions='''You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the a new line and the following template: FINAL ANSWER: [YOUR FINAL ANSWER]. YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
+
+To achieve the best results, follow these steps:
+1. **Understand the Question:** Carefully read and analyze the user's question to identify the core task and any specific constraints (e.g., format, type of answer).
+2. **Formulate a Plan:** Based on the question, devise a step-by-step plan. This might involve using web search, querying webpages, or analyzing YouTube videos. Consider what information is needed and which tool is best suited to obtain it.
+3. **Execute Tools:** Use the available tools (WebSearchTool, query_webpage, query_youtube_video, google_search) to gather the necessary information. Be mindful of rate limits and use caching effectively.
+4. **Synthesize Information:** Combine and process the information obtained from the tools to formulate a comprehensive answer. If the question requires specific data extraction, ensure accuracy.
+5. **Format the Final Answer:** Adhere strictly to the specified FINAL ANSWER template. Ensure the answer type (number, string, comma-separated list) matches the question's requirement.
+6. **Self-Correction:** If initial attempts fail or produce unsatisfactory results, re-evaluate the plan and try alternative approaches or tools.'''
         )
         return manager_agent
     else:
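
Note: the same retry-with-exponential-backoff loop now appears in all three tools (query_webpage, query_youtube_video, google_search). A possible follow-up, not part of this commit, is to factor that loop into a shared decorator. The sketch below assumes the module-level MAX_RETRIES and INITIAL_DELAY constants introduced here; the with_retries name is hypothetical.

import time
from functools import wraps

MAX_RETRIES = 3
INITIAL_DELAY = 1  # seconds

def with_retries(label):
    """Hypothetical helper: retry a tool body with exponential backoff.

    Not part of this commit; it only illustrates how the three duplicated
    retry loops could be collapsed into one place.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for i in range(MAX_RETRIES):
                try:
                    return func(*args, **kwargs)
                except Exception as e:
                    if i < MAX_RETRIES - 1:
                        # Back off 1s, 2s, 4s, ... before the next attempt.
                        delay = INITIAL_DELAY * (2 ** i)
                        print(f"Error in {label}: {e}. Retrying in {delay} seconds...")
                        time.sleep(delay)
                    else:
                        return f"Error in {label} after multiple retries: {e}"
        return wrapper
    return decorator

# Usage sketch (the tool bodies would then contain only the happy path);
# the ordering of @tool relative to @with_retries would need to be verified
# against how smolagents inspects the wrapped function:
#
# @tool
# @with_retries("query_webpage")
# def query_webpage(url: str, query: str) -> str:
#     ...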
requirements.txt CHANGED
@@ -7,4 +7,5 @@ markdownify
 duckduckgo-search
 wikipedia
 serpapi
-llama-index
+llama-index
+youtube-transcript-api
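
For reference, a minimal way to exercise the updated module, assuming it is imported as agent, that HF_TOKEN and SERPAPI_API_KEY are set in the environment, and that the task string below is only a placeholder:

from agent import initialize_agent

manager = initialize_agent()
if manager:
    # CodeAgent.run() executes the manager agent on a single task string.
    answer = manager.run("Placeholder question that requires a web search.")
    print(answer)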