Update app.py
app.py
CHANGED
@@ -1,11 +1,14 @@
 from fastapi import FastAPI, HTTPException, Query
 from fastapi.responses import JSONResponse
 from webscout import WEBS, transcriber, LLM
 from typing import Optional, List, Dict, Union
 from fastapi.encoders import jsonable_encoder
 from bs4 import BeautifulSoup
 import requests
 import urllib.parse
+import asyncio
+import aiohttp
+from typing import List
 
 app = FastAPI()
 
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
     visible_text = soup.get_text(strip=True)
     return visible_text
 
+async def fetch_and_extract(url, max_chars):
+    """Fetches a URL and extracts text asynchronously."""
+    async with aiohttp.ClientSession() as session:
+        try:
+            async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                response.raise_for_status()
+                html_content = await response.text()
+                visible_text = extract_text_from_webpage(html_content)
+                if len(visible_text) > max_chars:
+                    visible_text = visible_text[:max_chars] + "..."
+                return {"link": url, "text": visible_text}
+        except (aiohttp.ClientError, requests.exceptions.RequestException) as e:
+            print(f"Error fetching or processing {url}: {e}")
+            return {"link": url, "text": None}
+
 @app.get("/api/web_extract")
 async def web_extract(
     url: str,
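One thing worth noting about the new helper: it opens a fresh aiohttp.ClientSession for every URL, while aiohttp sessions are meant to be created once and shared so the connection pool is reused. A minimal sketch of that variant (not part of this commit), with fetch_many as a hypothetical name and extract_text_from_webpage assumed to be the function already defined in app.py:

import asyncio
import aiohttp

async def fetch_many(urls, max_chars):
    # Hypothetical variant: one shared session for all fetches instead of a
    # new session per URL, so TCP connections can be reused across requests.
    async with aiohttp.ClientSession(headers={"User-Agent": "Mozilla/5.0"}) as session:
        async def fetch(url):
            try:
                async with session.get(url) as response:
                    response.raise_for_status()
                    text = extract_text_from_webpage(await response.text())
                    if len(text) > max_chars:
                        text = text[:max_chars] + "..."
                    return {"link": url, "text": text}
            except aiohttp.ClientError as e:
                print(f"Error fetching or processing {url}: {e}")
                return {"link": url, "text": None}
        return await asyncio.gather(*(fetch(u) for u in urls))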
@@ -159,12 +177,8 @@ async def web_extract(
 ):
     """Extracts text from a given URL."""
     try:
-        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-        response.raise_for_status()
-        visible_text = extract_text_from_webpage(response.text)
-        if len(visible_text) > max_chars:
-            visible_text = visible_text[:max_chars] + "..."
-        return {"url": url, "text": visible_text}
+        result = await fetch_and_extract(url, max_chars)
+        return {"url": url, "text": result["text"]}
     except requests.exceptions.RequestException as e:
         raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
 
@@ -188,23 +202,10 @@ async def web_search_and_extract(
     search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                timelimit=timelimit, backend=backend, max_results=max_results)
 
-    # Extract text from each result's link
-    extracted_results = []
-    for result in search_results:
-        if 'href' in result:
-            link = result['href']
-            try:
-                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                response.raise_for_status()
-                visible_text = extract_text_from_webpage(response.text)
-                if len(visible_text) > max_chars:
-                    visible_text = visible_text[:max_chars] + "..."
-                extracted_results.append({"link": link, "text": visible_text})
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching or processing {link}: {e}")
-                extracted_results.append({"link": link, "text": None})
-        else:
-            extracted_results.append({"link": None, "text": None})
+    # Extract text from each result's link asynchronously
+    tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+    extracted_results = await asyncio.gather(*tasks)
+
     if extract_only:
         return JSONResponse(content=jsonable_encoder({extracted_results}))
     else:
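The unchanged context line return JSONResponse(content=jsonable_encoder({extracted_results})) is a pre-existing bug this commit does not touch: {extracted_results} is a set literal, and since extracted_results is a list (unhashable) that line raises TypeError at runtime. The intent was presumably to encode the list itself:

# Likely intended form (sketch, not in this commit):
return JSONResponse(content=jsonable_encoder(extracted_results))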
@@ -235,22 +236,13 @@ async def adv_web_search(
                           timelimit=timelimit, backend=backend,
                           max_results=max_results)
 
-    # 2. Extract text from top search result URLs
+    # 2. Extract text from top search result URLs asynchronously
     extracted_text = ""
-    for result in search_results:
-        if 'href' in result:
-            link = result['href']
-            try:
-                response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                response.raise_for_status()
-                visible_text = extract_text_from_webpage(response.text)
-                if len(visible_text) > max_chars:
-                    visible_text = visible_text[:max_chars] + "..."
-                extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
-            except requests.exceptions.RequestException as e:
-                print(f"Error fetching or processing {link}: {e}")
-        else:
-            pass
+    tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+    extracted_results = await asyncio.gather(*tasks)
+    for result in extracted_results:
+        if result['text']:
+            extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
 
     # 3. Construct the prompt for the LLM
     llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
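Since asyncio.gather starts every fetch_and_extract task at once, a query returning many results opens that many connections simultaneously. If that ever needs to be capped, a semaphore is the usual tool; a minimal sketch building on the commit's own helper, with gather_limited and the limit of 5 as hypothetical choices:

import asyncio

async def gather_limited(urls, max_chars, limit=5):
    # Cap how many fetch_and_extract coroutines run concurrently; the rest
    # wait on the semaphore until a slot frees up.
    semaphore = asyncio.Semaphore(limit)

    async def bounded(url):
        async with semaphore:
            return await fetch_and_extract(url, max_chars)

    return await asyncio.gather(*(bounded(u) for u in urls))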