Spaces:
Sleeping
Sleeping
Deploy scrap endpoints
Browse files- app.py +50 -18
- requirements.txt +3 -1
- scrap.py +63 -0
- backends.py → search.py +4 -1
app.py
CHANGED
|
@@ -1,18 +1,16 @@
|
|
| 1 |
from contextlib import asynccontextmanager
|
| 2 |
-
import json
|
| 3 |
from typing import Optional
|
| 4 |
-
from duckduckgo_search import DDGS
|
| 5 |
-
from duckduckgo_search.exceptions import RatelimitException
|
| 6 |
import expiringdict
|
| 7 |
-
from fastapi import FastAPI
|
|
|
|
|
|
|
| 8 |
from pydantic import BaseModel, Field
|
| 9 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 10 |
-
from urllib.parse import quote_plus
|
| 11 |
import logging
|
| 12 |
-
import re
|
| 13 |
import uvicorn
|
| 14 |
|
| 15 |
-
from
|
|
|
|
| 16 |
|
| 17 |
logging.basicConfig(
|
| 18 |
level=logging.INFO,
|
|
@@ -24,13 +22,16 @@ logging.basicConfig(
|
|
| 24 |
playwright = None
|
| 25 |
pw_browser: Optional[Browser] = None
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
@asynccontextmanager
|
| 29 |
async def api_lifespan(app: FastAPI):
|
| 30 |
global playwright, pw_browser
|
| 31 |
playwright = await async_playwright().start()
|
| 32 |
pw_browser = await playwright.chromium.launch(headless=True)
|
| 33 |
-
|
| 34 |
yield
|
| 35 |
|
| 36 |
await pw_browser.close()
|
|
@@ -39,6 +40,20 @@ async def api_lifespan(app: FastAPI):
|
|
| 39 |
app = FastAPI(lifespan=api_lifespan)
|
| 40 |
backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
class APISearchParams(BaseModel):
|
| 44 |
queries: list[str] = Field(...,
|
|
@@ -53,16 +68,6 @@ async def query_google_scholar(params: APISearchParams):
|
|
| 53 |
return {"error": "Unimplemented"}
|
| 54 |
|
| 55 |
|
| 56 |
-
@app.get('/')
|
| 57 |
-
async def status():
|
| 58 |
-
backend_keys = [k[0] for k in backend_status.items()]
|
| 59 |
-
backend_status_dict = {}
|
| 60 |
-
|
| 61 |
-
for k in backend_keys:
|
| 62 |
-
backend_status_dict[k] = backend_status.get(k)
|
| 63 |
-
return {"status": "running", "backend_status": backend_status_dict}
|
| 64 |
-
|
| 65 |
-
|
| 66 |
@app.post("/search_patents")
|
| 67 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 68 |
"""Searches google patents for the specified queries and returns the found documents."""
|
|
@@ -175,4 +180,31 @@ async def search(params: APISearchParams):
|
|
| 175 |
|
| 176 |
return APISearchResults(results=results, error=None)
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 1 |
from contextlib import asynccontextmanager
|
|
|
|
| 2 |
from typing import Optional
|
|
|
|
|
|
|
| 3 |
import expiringdict
|
| 4 |
+
from fastapi import APIRouter, FastAPI
|
| 5 |
+
from fastapi.routing import APIRouter as Router
|
| 6 |
+
import httpx
|
| 7 |
from pydantic import BaseModel, Field
|
| 8 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
|
|
| 9 |
import logging
|
|
|
|
| 10 |
import uvicorn
|
| 11 |
|
| 12 |
+
from scrap import scrap_patent_async, scrap_patent_bulk_async
|
| 13 |
+
from search import APISearchResults, APIPatentResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
|
| 14 |
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
|
|
|
| 22 |
playwright = None
|
| 23 |
pw_browser: Optional[Browser] = None
|
| 24 |
|
| 25 |
+
# Shared httpx client for all scraping requests.
# NOTE: max_keepalive_connections must not exceed max_connections — httpx caps
# the idle (keepalive) pool at the connection limit, so the original value of
# 20 against a 15-connection limit was inconsistent; clamp it to 15.
httpx_client = httpx.AsyncClient(
    timeout=30,
    limits=httpx.Limits(max_connections=15, max_keepalive_connections=15),
)
|
| 28 |
+
|
| 29 |
|
| 30 |
@asynccontextmanager
|
| 31 |
async def api_lifespan(app: FastAPI):
|
| 32 |
global playwright, pw_browser
|
| 33 |
playwright = await async_playwright().start()
|
| 34 |
pw_browser = await playwright.chromium.launch(headless=True)
|
|
|
|
| 35 |
yield
|
| 36 |
|
| 37 |
await pw_browser.close()
|
|
|
|
| 40 |
app = FastAPI(lifespan=api_lifespan)
|
| 41 |
backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
| 42 |
|
| 43 |
+
# Router grouping the scraping-related endpoints under the /scrap path prefix.
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
| 45 |
+
|
| 46 |
+
@app.get('/')
async def status():
    """Health-check endpoint.

    Returns:
        A dict with the service status and a snapshot of the per-backend
        status entries currently held in the expiring cache.
    """
    # Take a single snapshot of the cache instead of listing the keys and
    # re-fetching each one: the original key-list + get() loop could race
    # with entry expiry and report None for keys that expired between the
    # two lookups.
    return {"status": "running", "backend_status": dict(backend_status.items())}
|
| 54 |
+
|
| 55 |
+
# ===================== Search endpoints =====================
|
| 56 |
+
|
| 57 |
|
| 58 |
class APISearchParams(BaseModel):
|
| 59 |
queries: list[str] = Field(...,
|
|
|
|
| 68 |
return {"error": "Unimplemented"}
|
| 69 |
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
@app.post("/search_patents")
|
| 72 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 73 |
"""Searches google patents for the specified queries and returns the found documents."""
|
|
|
|
| 180 |
|
| 181 |
return APISearchResults(results=results, error=None)
|
| 182 |
|
| 183 |
+
# =========================== Scrapping endpoints ===========================
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@scrap_router.get("/scrap_patent/{patent_id}")
async def scrap_patent(patent_id: str):
    """Scrape a single patent, identified by its Google Patents ID, and return the parsed result."""
    url = f"https://patents.google.com/patent/{patent_id}/en"
    return await scrap_patent_async(httpx_client, url)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class ScrapPatentsRequest(BaseModel):
    """Body schema for the bulk patent-scraping endpoint."""
    # Google Patents identifiers to fetch (one scrape task per ID).
    patent_ids: list[str] = Field(..., description="List of patent IDs to scrap")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
@scrap_router.post("/scrap_patents_bulk")
async def scrap_patents(params: ScrapPatentsRequest):
    """Scrape all requested patents concurrently and return the successful results."""
    urls = [f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids]
    return await scrap_patent_bulk_async(httpx_client, urls)
|
| 205 |
+
|
| 206 |
+
# ===============================================================================

app.include_router(scrap_router)

if __name__ == "__main__":
    # Only start the server when this file is executed directly. The original
    # unguarded uvicorn.run() would also fire on plain import (e.g. when an
    # external ASGI runner loads `app`), starting a second server.
    # NOTE(review): confirm the deployment launches this with `python app.py`.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
requirements.txt
CHANGED
|
@@ -3,4 +3,6 @@ uvicorn
|
|
| 3 |
pydantic
|
| 4 |
playwright
|
| 5 |
duckduckgo_search
|
| 6 |
-
expiringdict
|
|
|
|
|
|
|
|
|
| 3 |
pydantic
|
| 4 |
playwright
|
| 5 |
duckduckgo_search
|
| 6 |
+
expiringdict
|
| 7 |
+
beautifulsoup4
|
| 8 |
+
httpx
|
scrap.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from httpx import AsyncClient
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # Patent title, taken from the page's "DC.title" meta tag.
    title: str
    # The fields below are None when the corresponding section is absent
    # from the scraped page.
    abstract: Optional[str] = None
    description: Optional[str] = None
    claims: Optional[str] = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Fetch and parse a single Google Patents page.

    Args:
        client: Shared httpx AsyncClient used for the request.
        patent_url: Full URL of the patent page to scrape.

    Returns:
        The parsed patent data, or None when the request or parsing fails
        (best-effort contract — the return annotation now reflects this;
        the original claimed a plain PatentScrapResult yet returned None).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract (absent on some patents).
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(
            strip=True) if abstract_div else None

        # Full description section.
        description_section = soup.find("section", itemprop="description")
        description = description_section.get_text(
            separator="\n", strip=True) if description_section else None

        # Claims section.
        claims_section = soup.find("section", itemprop="claims")
        claims = claims_section.get_text(
            separator="\n", strip=True) if claims_section else None

        # Patent title from the DC.title meta tag. Guard against the tag (or
        # its content attribute) being absent: the original chained
        # .find(...).get("content").strip() unconditionally and raised
        # AttributeError, which was swallowed as a generic scrape failure.
        meta_title_tag = soup.find("meta", {"name": "DC.title"})
        title_content = meta_title_tag.get("content") if meta_title_tag else None
        if title_content is None:
            logging.error("No DC.title meta tag found on %s", patent_url)
            return None

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=title_content.strip()
        )
    except Exception as e:
        # Best-effort: any network/parse failure is logged and reported as None
        # so bulk scraping can continue with the remaining URLs.
        logging.error(f"Error scraping {patent_url}: {e}")
        return None
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape several patent pages concurrently, dropping the ones that failed."""
    scraped = await asyncio.gather(
        *(scrap_patent_async(client, url) for url in patent_urls)
    )
    # scrap_patent_async signals failure with None; keep only real results.
    return [patent for patent in scraped if patent is not None]
|
backends.py → search.py
RENAMED
|
@@ -21,6 +21,9 @@ class APISearchResults(BaseModel):
|
|
| 21 |
|
| 22 |
class BraveSearchBlockedException(Exception):
|
| 23 |
"""Dummy exception to detect when the headless browser is flagged as suspicious."""
|
|
|
|
|
|
|
|
|
|
| 24 |
pass
|
| 25 |
|
| 26 |
|
|
@@ -36,7 +39,7 @@ async def playwright_open_page(browser: Browser):
|
|
| 36 |
await context.close()
|
| 37 |
|
| 38 |
|
| 39 |
-
#TODO: update to return same format for results
|
| 40 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 41 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 42 |
|
|
|
|
| 21 |
|
| 22 |
class BraveSearchBlockedException(Exception):
    """Raised when Brave Search flags the headless browser as suspicious.

    Carries a default message so a bare `raise BraveSearchBlockedException()`
    is self-describing; explicit arguments, when given, override the default.
    """

    def __init__(self, *args):
        # The original accepted *args but discarded them entirely; forward
        # them when provided so callers can attach context, while keeping
        # the no-argument behavior identical. (Also drops the stray `pass`.)
        if not args:
            args = ("Brave Search blocked the request, likely due to flagging browser as suspicious",)
        super().__init__(*args)
|
| 28 |
|
| 29 |
|
|
|
|
| 39 |
await context.close()
|
| 40 |
|
| 41 |
|
| 42 |
+
# TODO: update to return same format for results
|
| 43 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 44 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 45 |
|