Update app.py
app.py
CHANGED
@@ -1,11 +1,14 @@
 from fastapi import FastAPI, HTTPException, Query
 from fastapi.responses import JSONResponse
 from webscout import WEBS, transcriber, LLM
 from typing import Optional, List, Dict, Union
 from fastapi.encoders import jsonable_encoder
 from bs4 import BeautifulSoup
 import requests
 import urllib.parse
+import asyncio
+import aiohttp
+from typing import List
 
 app = FastAPI()
 
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
     visible_text = soup.get_text(strip=True)
     return visible_text
 
+async def fetch_and_extract(url, max_chars):
+    """Fetches a URL and extracts text asynchronously."""
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                response.raise_for_status()
+                html_content = await response.text()
+                visible_text = extract_text_from_webpage(html_content)
+                if len(visible_text) > max_chars:
+                    visible_text = visible_text[:max_chars] + "..."
+                return {"link": url, "text": visible_text}
+        except (aiohttp.ClientError, requests.exceptions.RequestException) as e:
+            print(f"Error fetching or processing {url}: {e}")
+            return {"link": url, "text": None}
+
 @app.get("/api/web_extract")
 async def web_extract(
     url: str,
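One thing worth noting about the new helper: it opens a fresh aiohttp.ClientSession for every URL, while aiohttp sessions are meant to be created once and shared so the connection pool is reused. A minimal sketch of that variant (not part of this commit), with fetch_many as a hypothetical name and extract_text_from_webpage assumed to be the function already defined in app.py:

import asyncio
import aiohttp

async def fetch_many(urls, max_chars):
    # Hypothetical variant: one shared session for all fetches instead of a
    # new session per URL, so TCP connections can be reused across requests.
    async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
        async def fetch(url):
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    text = extract_text_from_webpage(await response.text())
                    if len(text) > max_chars:
                        text = text[:max_chars] + "..."
                    return {"link": url, "text": text}
            except aiohttp.ClientError as e:
                print(f"Error fetching or processing {url}: {e}")
                return {"link": url, "text": None}
        return await asyncio.gather(*(fetch(u) for u in urls))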
@@ -159,12 +177,8 @@ async def web_extract(
 ):
     """Extracts text from a given URL."""
     try:
-        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-        response.raise_for_status()
-        visible_text = extract_text_from_webpage(response.text)
-        if len(visible_text) > max_chars:
-            visible_text = visible_text[:max_chars] + "..."
-        return {"url": url, "text": visible_text}
+        result = await fetch_and_extract(url, max_chars)
+        return {"url": url, "text": result["text"]}
     except requests.exceptions.RequestException as e:
         raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
 
@@ -188,23 +202,10 @@ async def web_search_and_extract(
     search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                timelimit=timelimit, backend=backend, max_results=max_results)
 
-    # Extract text from each result's link
-    extracted_results = []
-    for result in search_results:
-        if 'href' in result:
-            link = result['href']
-            try:
-                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                response.raise_for_status()
-                visible_text = extract_text_from_webpage(response.text)
-                if len(visible_text) > max_chars:
-                    visible_text = visible_text[:max_chars] + "..."
-                extracted_results.append({"link": link, "text": visible_text})
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching or processing {link}: {e}")
-                extracted_results.append({"link": link, "text": None})
-        else:
-            extracted_results.append({"link": None, "text": None})
+    # Extract text from each result's link asynchronously
+    tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+    extracted_results = await asyncio.gather(*tasks)
+
     if extract_only:
         return JSONResponse(content=jsonable_encoder({extracted_results}))
     else:
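The unchanged context line return JSONResponse(content=jsonable_encoder({extracted_results})) is a pre-existing bug this commit does not touch: {extracted_results} is a set literal, and since extracted_results is a list (unhashable) that line raises TypeError at runtime. The intent was presumably to encode the list itself:

# Likely intended form (sketch, not in this commit):
return JSONResponse(content=jsonable_encoder(extracted_results))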
@@ -235,22 +236,13 @@ async def adv_web_search(
                           timelimit=timelimit, backend=backend,
                           max_results=max_results)
 
-    # 2. Extract text from top search result URLs
+    # 2. Extract text from top search result URLs asynchronously
     extracted_text = ""
-    for result in search_results:
-        if 'href' in result:
-            link = result['href']
-            try:
-                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                response.raise_for_status()
-                visible_text = extract_text_from_webpage(response.text)
-                if len(visible_text) > max_chars:
-                    visible_text = visible_text[:max_chars] + "..."
-                extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching or processing {link}: {e}")
-        else:
-            pass
+    tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+    extracted_results = await asyncio.gather(*tasks)
+    for result in extracted_results:
+        if result['text']:
+            extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
 
     # 3. Construct the prompt for the LLM
     llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
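Since asyncio.gather starts every fetch_and_extract task at once, a query returning many results opens that many connections simultaneously. If that ever needs to be capped, a semaphore is the usual tool; a minimal sketch building on the commit's own helper, with gather_limited and the limit of 5 as hypothetical choices:

import asyncio

async def gather_limited(urls, max_chars, limit=5):
    # Cap how many fetch_and_extract coroutines run concurrently; the rest
    # wait on the semaphore until a slot frees up.
    semaphore = asyncio.Semaphore(limit)

    async def bounded(url):
        async with semaphore:
            return await fetch_and_extract(url, max_chars)

    return await asyncio.gather(*(bounded(u) for u in urls))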