Make scrapping async
app.py
CHANGED
@@ -1,3 +1,4 @@
+import asyncio
 from contextlib import asynccontextmanager
 from typing import Optional
 from fastapi import APIRouter, FastAPI
@@ -8,8 +9,9 @@ from playwright.async_api import async_playwright, Browser, BrowserContext, Page
 import logging
 import uvicorn
 
-from scrap import scrap_patent_async, scrap_patent_bulk_async
-from serp import SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from scrap import PatentScrapBulkResponse, scrap_patent_async, scrap_patent_bulk_async
+from serp import SerpQuery, SerpResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents, query_google_scholar
+from utils import log_gathered_exceptions
 
 logging.basicConfig(
     level=logging.INFO,
@@ -47,95 +49,99 @@ serp_router = APIRouter(prefix="/serp", tags=["serp scrapping"])
 # ===================== Search endpoints =====================
 
 
-class SerpQuery(BaseModel):
-    queries: list[str] = Field(...,
-                               description="The list of queries to search for")
-    n_results: int = Field(
-        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
-
-
 @serp_router.post("/search_scholar")
 async def search_google_scholar(params: SerpQuery):
     """Queries google scholar for the specified query"""
-    for q in params.queries
+    logging.info(f"Searching Google Scholar for queries: {params.queries}")
+    results = await asyncio.gather(*[query_google_scholar(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "google scholar search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_patents")
 async def search_patents(params: SerpQuery) -> SerpResults:
     """Searches google patents for the specified queries and returns the found documents."""
-    for q in params.queries
+    logging.info(f"Searching Google Patents for queries: {params.queries}")
+    results = await asyncio.gather(*[query_google_patents(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "google patent search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_brave")
 async def search_brave(params: SerpQuery) -> SerpResults:
     """Searches brave search for the specified queries and returns the found documents."""
-        last_exception = e
-        logging.error(
-            f"Failed to query Brave search with query `{q}`: {e}")
+    logging.info(f"Searching Brave Search for queries: {params.queries}")
+    results = await asyncio.gather(*[query_brave_search(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "brave search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_bing")
 async def search_bing(params: SerpQuery) -> SerpResults:
     """Searches Bing search for the specified queries and returns the found documents."""
-        last_exception = e
-        logging.error(
-            f"Failed to query Bing search with query `{q}`: {e}")
+    logging.info(f"Searching Bing Search for queries: {params.queries}")
+    results = await asyncio.gather(*[query_bing_search(pw_browser, q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "bing search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search_duck")
 async def search_duck(params: SerpQuery) -> SerpResults:
     """Searches duckduckgo for the specified queries and returns the found documents"""
-    return SerpResults(results=
+    logging.info(f"Searching DuckDuckGo for queries: {params.queries}")
+    results = await asyncio.gather(*[query_ddg_search(q, params.n_results) for q in params.queries], return_exceptions=True)
+    log_gathered_exceptions(results, "duckduckgo search", params)
+
+    # Filter out exceptions and flatten the results
+    filtered_results = [r for r in results if not isinstance(r, Exception)]
+    flattened_results = [
+        item for sublist in filtered_results for item in sublist]
+
+    # all queries failed, return the last exception
+    if len(filtered_results) == 0:
+        return SerpResults(results=[], error=str(results[-1]))
+
+    return SerpResults(results=flattened_results, error=None)
 
 
 @serp_router.post("/search")
@@ -180,11 +186,16 @@ async def search(params: SerpQuery):
 # =========================== Scrapping endpoints ===========================
 
 
+# TODO: return a proper error response if the patent is not found or scrapping fails
 @scrap_router.get("/scrap_patent/{patent_id}")
 async def scrap_patent(patent_id: str):
     """Scraps the specified patent from Google Patents."""
+    try:
+        patent = await scrap_patent_async(httpx_client, f"https://patents.google.com/patent/{patent_id}/en")
+        return patent
+    except Exception as e:
+        logging.warning(f"Failed to scrap patent {patent_id}: {e}")
+        return None
 
 
 class ScrapPatentsRequest(BaseModel):
@@ -193,11 +204,10 @@ class ScrapPatentsRequest(BaseModel):
                                  description="List of patent IDs to scrap")
 
 
-@scrap_router.post("/scrap_patents_bulk")
-async def scrap_patents(params: ScrapPatentsRequest):
+@scrap_router.post("/scrap_patents_bulk", response_model=PatentScrapBulkResponse)
+async def scrap_patents(params: ScrapPatentsRequest) -> PatentScrapBulkResponse:
     """Scraps multiple patents from Google Patents."""
-    patents = await scrap_patent_bulk_async(httpx_client,
-        f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids])
+    patents = await scrap_patent_bulk_async(httpx_client, params.patent_ids)
     return patents
 
 # ===============================================================================
@@ -205,4 +215,4 @@ async def scrap_patents(params: ScrapPatentsRequest):
 app.include_router(serp_router)
 app.include_router(scrap_router)
 
-uvicorn.run(app, host="
+uvicorn.run(app, host="127.0.0.1", port=7860)
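For reference, a minimal client-side sketch of calling the reworked endpoints. The /serp prefix and the SerpQuery/ScrapPatentsRequest field names come from this diff; the /scrap prefix, the host/port (taken from the uvicorn.run call above), and the patent ID are assumptions for illustration only.

# Hypothetical client for the endpoints above, using httpx (not part of this commit).
import asyncio
import httpx

BASE_URL = "http://127.0.0.1:7860"  # host/port from the uvicorn.run(...) call above


async def main():
    async with httpx.AsyncClient(base_url=BASE_URL, timeout=120) as client:
        # SerpQuery body: queries + n_results (the /serp prefix is set on serp_router)
        serp = await client.post(
            "/serp/search_patents",
            json={"queries": ["microfluidic pump"], "n_results": 10})
        print(serp.json())

        # ScrapPatentsRequest body: patent_ids; the /scrap prefix and the ID are assumed
        bulk = await client.post(
            "/scrap/scrap_patents_bulk",
            json={"patent_ids": ["US9876543B2"]})
        print(bulk.json())

asyncio.run(main())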
scrap.py
CHANGED
@@ -27,65 +27,77 @@ async def scrap_patent_async(client: AsyncClient, patent_url: str) -> PatentScrapResult:
     headers = {
         "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
     }
+    response = await client.get(patent_url, headers=headers)
+    response.raise_for_status()
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # Abstract
+    abstract_div = soup.find("div", {"class": "abstract"})
+    abstract = abstract_div.get_text(
+        strip=True) if abstract_div else None
+
+    # Description
+    description_section = soup.find("section", itemprop="description")
+    description = description_section.get_text(
+        separator="\n", strip=True) if description_section else None
+
+    # Field of the Invention
+    invention_field_match = re.findall(
+        r"(FIELD OF THE INVENTION|TECHNICAL FIELD)(.*?)(?:(BACKGROUND|BACKGROUND OF THE INVENTION|SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE RELATED ART))", description, re.IGNORECASE | re.DOTALL) if description_section else None
+    invention_field = invention_field_match[0][1].strip(
+    ) if invention_field_match else None
+
+    # Background of the Invention
+    invention_background_match = re.findall(
+        r"(BACKGROUND OF THE INVENTION|BACKGROUND)(.*?)(?:(SUMMARY|BRIEF SUMMARY|DETAILED DESCRIPTION|DESCRIPTION OF THE PREFERRED EMBODIMENTS|DESCRIPTION))", description, re.IGNORECASE | re.DOTALL) if description_section else None
+    invention_background = invention_background_match[0][1].strip(
+    ) if invention_background_match else None
+
+    # Claims
+    claims_section = soup.find("section", itemprop="claims")
+    claims = claims_section.get_text(
+        separator="\n", strip=True) if claims_section else None
+
+    # Patent Title
+    meta_title = soup.find("meta", {"name": "DC.title"}).get(
+        "content").strip()
+
+    # Patent publication number
+    # pub_num = soup.select_one("h2#pubnum").get_text(strip=True)
+    # get the h2 with id ="pubnum" and extract the text
+
+    return PatentScrapResult(
+        # publication_number=pub_num,
+        abstract=abstract,
+        description=description,
+        claims=claims,
+        title=meta_title,
+        field_of_invention=invention_field,
+        background=invention_background
+    )
+
+
+class PatentScrapBulkResponse(BaseModel):
+    """Response model for bulk patent scraping."""
+    patents: list[PatentScrapResult]
+    failed_ids: list[str]
+
+
+async def scrap_patent_bulk_async(client: AsyncClient, patent_ids: list[int]) -> PatentScrapBulkResponse:
     """Scrape multiple patents asynchronously."""
+    urls = [
+        f"https://patents.google.com/patent/{pid}/en" for pid in patent_ids]
+    results = await asyncio.gather(*[scrap_patent_async(client, url) for url in urls], return_exceptions=True)
 
+    filtered_results = [
+        res for res in results if not isinstance(res, Exception)]
+
+    failed_ids = [
+        patent_ids[i] for i, res in enumerate(results) if isinstance(res, Exception)
+    ]
+
+    return PatentScrapBulkResponse(
+        patents=filtered_results,
+        failed_ids=failed_ids
+    )
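A short sketch of how the new bulk scraper separates successes from failures, calling scrap_patent_bulk_async directly with the signature shown in this diff. The patent IDs are made up for illustration.

# Hypothetical standalone usage of scrap_patent_bulk_async (IDs are examples only).
import asyncio
from httpx import AsyncClient

from scrap import scrap_patent_bulk_async


async def main():
    async with AsyncClient(follow_redirects=True) as client:
        resp = await scrap_patent_bulk_async(client, ["US9876543B2", "NOT-A-REAL-ID"])
        # Successful scrapes end up in resp.patents, failing IDs in resp.failed_ids
        print(f"scraped={len(resp.patents)} failed_ids={resp.failed_ids}")

asyncio.run(main())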
serp.py
CHANGED
@@ -1,11 +1,24 @@
 from contextlib import asynccontextmanager
 from typing import Optional
 from duckduckgo_search import DDGS
-from pydantic import BaseModel
+from pydantic import BaseModel, Field
 from playwright.async_api import Browser, BrowserContext, Page, TimeoutError
 from urllib.parse import quote_plus
 import logging
 import re
+from asyncio import Semaphore
+
+# Concurrency limit for Playwright browser contexts.
+# This is to prevent too many concurrent browser contexts from being created,
+PLAYWRIGHT_CONCURRENCY_LIMIT = 10
+
+
+class SerpQuery(BaseModel):
+    queries: list[str] = Field(...,
+                               description="The list of queries to search for")
+    n_results: int = Field(
+        10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
+
 
 class SerpResults(BaseModel):
     """Model for SERP scrapping results"""
@@ -21,16 +34,20 @@ class BraveSearchBlockedException(Exception):
     pass
 
 
+_PLAYWRIGHT_CONCURRENCY_SEMAPHORE = Semaphore(PLAYWRIGHT_CONCURRENCY_LIMIT)
+
+
 @asynccontextmanager
 async def playwright_open_page(browser: Browser):
     """Context manager for playwright pages"""
+    async with _PLAYWRIGHT_CONCURRENCY_SEMAPHORE:
+        context: BrowserContext = await browser.new_context()
+        page: Page = await context.new_page()
+        try:
+            yield page
+        finally:
+            await page.close()
+            await context.close()
 
 
 async def query_google_scholar(browser: Browser, q: str, n_results: int = 10):
@@ -145,28 +162,23 @@ async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
 
     results = []
 
-            url = await result.locator('a').nth(0).get_attribute('href')
-
-            # Filter out results with no URL or brave-specific URLs
-            if url is None or url.startswith('/'):
-                continue
-
-                "href": url
-            })
-
-            f"Timeout on selector while parsing Brave Search SERP: {e}")
+    for result in results_cards:
+        title = await result.locator('.title').all_inner_texts()
+        description = await result.locator('.snippet-description').all_inner_texts()
+        url = await result.locator('a').nth(0).get_attribute('href')
+
+        # Filter out results with no URL or brave-specific URLs
+        if url is None or url.startswith('/'):
+            continue
+
+        results.append({
+            "title": title[0] if title else "",
+            "body": description[0] if description else "",
+            "href": url
+        })
+
+        if len(results) >= n_results:
+            break
 
     return results
 
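The point of the semaphore-gated context manager above is that the endpoints can asyncio.gather an arbitrary number of queries while at most PLAYWRIGHT_CONCURRENCY_LIMIT browser contexts exist at once. A self-contained sketch of the same pattern, with a dummy resource standing in for a real browser:

# Minimal illustration of the semaphore-gated async context manager pattern
# used by playwright_open_page (dummy resource, no Playwright required).
import asyncio
from contextlib import asynccontextmanager

LIMIT = 3
_sem = asyncio.Semaphore(LIMIT)
_open = 0


@asynccontextmanager
async def open_resource():
    global _open
    async with _sem:               # at most LIMIT resources open at any time
        _open += 1
        try:
            yield _open
        finally:
            _open -= 1


async def worker(i: int):
    async with open_resource():
        assert _open <= LIMIT      # gather() schedules all workers, the semaphore throttles them
        await asyncio.sleep(0.05)  # stand-in for page navigation / scraping
        return i


async def main():
    done = await asyncio.gather(*[worker(i) for i in range(10)])
    print(f"{len(done)} workers finished, never more than {LIMIT} concurrent")

asyncio.run(main())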
utils.py
ADDED
@@ -0,0 +1,9 @@
+from asyncio.log import logger
+from serp import SerpQuery
+
+
+def log_gathered_exceptions(results: list, context: str, params: SerpQuery):
+    """Logs gathered exceptions with context and parameters."""
+    for exc, q in zip(results, params.queries):
+        if isinstance(exc, Exception):
+            logger.warning(f"Error during {context} for query '{q}': {exc}")
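log_gathered_exceptions relies on asyncio.gather(..., return_exceptions=True) keeping results positionally aligned with params.queries, which is what lets it zip them back together. A self-contained sketch of that pairing, with a dummy search coroutine and a local copy of the helper so it runs without the rest of the repo:

# Illustration of the gather/return_exceptions pattern the helper above assumes.
import asyncio
import logging

logging.basicConfig(level=logging.INFO)


async def fake_search(q: str) -> list[str]:
    if q == "bad":
        raise RuntimeError("simulated failure")
    return [f"result for {q}"]


def log_exceptions(results: list, context: str, queries: list[str]) -> None:
    # Same shape as log_gathered_exceptions: results[i] corresponds to queries[i]
    for exc, q in zip(results, queries):
        if isinstance(exc, Exception):
            logging.warning(f"Error during {context} for query '{q}': {exc}")


async def main():
    queries = ["good", "bad"]
    results = await asyncio.gather(
        *[fake_search(q) for q in queries], return_exceptions=True)
    log_exceptions(results, "demo search", queries)
    ok = [r for r in results if not isinstance(r, Exception)]
    print(f"{len(ok)}/{len(queries)} queries succeeded")

asyncio.run(main())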
|