Spaces:
Sleeping
Sleeping
Deploy scrap endpoints
Browse files- app.py +50 -18
- requirements.txt +3 -1
- scrap.py +63 -0
- backends.py → search.py +4 -1
app.py
CHANGED
|
@@ -1,18 +1,16 @@
|
|
| 1 |
from contextlib import asynccontextmanager
|
| 2 |
-
import json
|
| 3 |
from typing import Optional
|
| 4 |
-
from duckduckgo_search import DDGS
|
| 5 |
-
from duckduckgo_search.exceptions import RatelimitException
|
| 6 |
import expiringdict
|
| 7 |
-
from fastapi import FastAPI
|
|
|
|
|
|
|
| 8 |
from pydantic import BaseModel, Field
|
| 9 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
| 10 |
-
from urllib.parse import quote_plus
|
| 11 |
import logging
|
| 12 |
-
import re
|
| 13 |
import uvicorn
|
| 14 |
|
| 15 |
-
from
|
|
|
|
| 16 |
|
| 17 |
logging.basicConfig(
|
| 18 |
level=logging.INFO,
|
|
@@ -24,13 +22,16 @@ logging.basicConfig(
|
|
| 24 |
playwright = None
|
| 25 |
pw_browser: Optional[Browser] = None
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
@asynccontextmanager
|
| 29 |
async def api_lifespan(app: FastAPI):
|
| 30 |
global playwright, pw_browser
|
| 31 |
playwright = await async_playwright().start()
|
| 32 |
pw_browser = await playwright.chromium.launch(headless=True)
|
| 33 |
-
|
| 34 |
yield
|
| 35 |
|
| 36 |
await pw_browser.close()
|
|
@@ -39,6 +40,20 @@ async def api_lifespan(app: FastAPI):
|
|
| 39 |
app = FastAPI(lifespan=api_lifespan)
|
| 40 |
backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
|
| 43 |
class APISearchParams(BaseModel):
|
| 44 |
queries: list[str] = Field(...,
|
|
@@ -53,16 +68,6 @@ async def query_google_scholar(params: APISearchParams):
|
|
| 53 |
return {"error": "Unimplemented"}
|
| 54 |
|
| 55 |
|
| 56 |
-
@app.get('/')
|
| 57 |
-
async def status():
|
| 58 |
-
backend_keys = [k[0] for k in backend_status.items()]
|
| 59 |
-
backend_status_dict = {}
|
| 60 |
-
|
| 61 |
-
for k in backend_keys:
|
| 62 |
-
backend_status_dict[k] = backend_status.get(k)
|
| 63 |
-
return {"status": "running", "backend_status": backend_status_dict}
|
| 64 |
-
|
| 65 |
-
|
| 66 |
@app.post("/search_patents")
|
| 67 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 68 |
"""Searches google patents for the specified queries and returns the found documents."""
|
|
@@ -175,4 +180,31 @@ async def search(params: APISearchParams):
|
|
| 175 |
|
| 176 |
return APISearchResults(results=results, error=None)
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 1 |
from contextlib import asynccontextmanager
|
|
|
|
| 2 |
from typing import Optional
|
|
|
|
|
|
|
| 3 |
import expiringdict
|
| 4 |
+
from fastapi import APIRouter, FastAPI
|
| 5 |
+
from fastapi.routing import APIRouter as Router
|
| 6 |
+
import httpx
|
| 7 |
from pydantic import BaseModel, Field
|
| 8 |
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
|
|
| 9 |
import logging
|
|
|
|
| 10 |
import uvicorn
|
| 11 |
|
| 12 |
+
from scrap import scrap_patent_async, scrap_patent_bulk_async
|
| 13 |
+
from search import APISearchResults, APIPatentResults, query_bing_search, query_brave_search, query_ddg_search, query_google_patents
|
| 14 |
|
| 15 |
logging.basicConfig(
|
| 16 |
level=logging.INFO,
|
|
|
|
| 22 |
playwright = None
|
| 23 |
pw_browser: Optional[Browser] = None
|
| 24 |
|
| 25 |
+
# Shared httpx client for all scraping requests.
# NOTE: max_keepalive_connections must not exceed max_connections — httpx caps
# the idle (keepalive) pool at the connection limit, so the original value of
# 20 against a 15-connection limit was inconsistent; clamp it to 15.
httpx_client = httpx.AsyncClient(
    timeout=30,
    limits=httpx.Limits(max_connections=15, max_keepalive_connections=15),
)
|
| 28 |
+
|
| 29 |
|
| 30 |
@asynccontextmanager
|
| 31 |
async def api_lifespan(app: FastAPI):
|
| 32 |
global playwright, pw_browser
|
| 33 |
playwright = await async_playwright().start()
|
| 34 |
pw_browser = await playwright.chromium.launch(headless=True)
|
|
|
|
| 35 |
yield
|
| 36 |
|
| 37 |
await pw_browser.close()
|
|
|
|
| 40 |
app = FastAPI(lifespan=api_lifespan)
|
| 41 |
backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
| 42 |
|
| 43 |
+
# Router grouping the scraping-related endpoints under the /scrap path prefix.
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
| 45 |
+
|
| 46 |
+
@app.get('/')
async def status():
    """Health-check endpoint.

    Returns:
        A dict with the service status and a snapshot of the per-backend
        status entries currently held in the expiring cache.
    """
    # Take a single snapshot of the cache instead of listing the keys and
    # re-fetching each one: the original key-list + get() loop could race
    # with entry expiry and report None for keys that expired between the
    # two lookups.
    return {"status": "running", "backend_status": dict(backend_status.items())}
|
| 54 |
+
|
| 55 |
+
# ===================== Search endpoints =====================
|
| 56 |
+
|
| 57 |
|
| 58 |
class APISearchParams(BaseModel):
|
| 59 |
queries: list[str] = Field(...,
|
|
|
|
| 68 |
return {"error": "Unimplemented"}
|
| 69 |
|
| 70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
@app.post("/search_patents")
|
| 72 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 73 |
"""Searches google patents for the specified queries and returns the found documents."""
|
|
|
|
| 180 |
|
| 181 |
return APISearchResults(results=results, error=None)
|
| 182 |
|
| 183 |
+
# =========================== Scrapping endpoints ===========================
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@scrap_router.get("/scrap_patent/{patent_id}")
async def scrap_patent(patent_id: str):
    """Scrape a single patent, identified by its Google Patents ID, and return the parsed result."""
    url = f"https://patents.google.com/patent/{patent_id}/en"
    return await scrap_patent_async(httpx_client, url)
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
class ScrapPatentsRequest(BaseModel):
    """Body schema for the bulk patent-scraping endpoint."""
    # Google Patents identifiers to fetch (one scrape task per ID).
    patent_ids: list[str] = Field(..., description="List of patent IDs to scrap")
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
@scrap_router.post("/scrap_patents_bulk")
async def scrap_patents(params: ScrapPatentsRequest):
    """Scrape all requested patents concurrently and return the successful results."""
    urls = [f"https://patents.google.com/patent/{pid}/en" for pid in params.patent_ids]
    return await scrap_patent_bulk_async(httpx_client, urls)
|
| 205 |
+
|
| 206 |
+
# ===============================================================================

app.include_router(scrap_router)

if __name__ == "__main__":
    # Only start the server when this file is executed directly. The original
    # unguarded uvicorn.run() would also fire on plain import (e.g. when an
    # external ASGI runner loads `app`), starting a second server.
    # NOTE(review): confirm the deployment launches this with `python app.py`.
    uvicorn.run(app, host="0.0.0.0", port=7860)
|
requirements.txt
CHANGED
|
@@ -3,4 +3,6 @@ uvicorn
|
|
| 3 |
pydantic
|
| 4 |
playwright
|
| 5 |
duckduckgo_search
|
| 6 |
-
expiringdict
|
|
|
|
|
|
|
|
|
| 3 |
pydantic
|
| 4 |
playwright
|
| 5 |
duckduckgo_search
|
| 6 |
+
expiringdict
|
| 7 |
+
beautifulsoup4
|
| 8 |
+
httpx
|
scrap.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
from typing import Optional
|
| 4 |
+
from httpx import AsyncClient
|
| 5 |
+
from bs4 import BeautifulSoup
|
| 6 |
+
from pydantic import BaseModel
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class PatentScrapResult(BaseModel):
    """Schema for the result of scraping a google patents page."""
    # Patent title, taken from the page's "DC.title" meta tag.
    title: str
    # The fields below are None when the corresponding section is absent
    # from the scraped page.
    abstract: Optional[str] = None
    description: Optional[str] = None
    claims: Optional[str] = None
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
async def scrap_patent_async(client: AsyncClient, patent_url: str) -> Optional[PatentScrapResult]:
    """Fetch and parse a single Google Patents page.

    Args:
        client: Shared httpx AsyncClient used for the request.
        patent_url: Full URL of the patent page to scrape.

    Returns:
        The parsed patent data, or None when the request or parsing fails
        (best-effort contract — the return annotation now reflects this;
        the original claimed a plain PatentScrapResult yet returned None).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GPTBot/1.0; +https://openai.com/gptbot)"
    }
    try:
        response = await client.get(patent_url, headers=headers)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Abstract (absent on some patents).
        abstract_div = soup.find("div", {"class": "abstract"})
        abstract = abstract_div.get_text(
            strip=True) if abstract_div else None

        # Full description section.
        description_section = soup.find("section", itemprop="description")
        description = description_section.get_text(
            separator="\n", strip=True) if description_section else None

        # Claims section.
        claims_section = soup.find("section", itemprop="claims")
        claims = claims_section.get_text(
            separator="\n", strip=True) if claims_section else None

        # Patent title from the DC.title meta tag. Guard against the tag (or
        # its content attribute) being absent: the original chained
        # .find(...).get("content").strip() unconditionally and raised
        # AttributeError, which was swallowed as a generic scrape failure.
        meta_title_tag = soup.find("meta", {"name": "DC.title"})
        title_content = meta_title_tag.get("content") if meta_title_tag else None
        if title_content is None:
            logging.error("No DC.title meta tag found on %s", patent_url)
            return None

        return PatentScrapResult(
            abstract=abstract,
            description=description,
            claims=claims,
            title=title_content.strip()
        )
    except Exception as e:
        # Best-effort: any network/parse failure is logged and reported as None
        # so bulk scraping can continue with the remaining URLs.
        logging.error(f"Error scraping {patent_url}: {e}")
        return None
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
async def scrap_patent_bulk_async(client: AsyncClient, patent_urls: list[str]) -> list[PatentScrapResult]:
    """Scrape several patent pages concurrently, dropping the ones that failed."""
    scraped = await asyncio.gather(
        *(scrap_patent_async(client, url) for url in patent_urls)
    )
    # scrap_patent_async signals failure with None; keep only real results.
    return [patent for patent in scraped if patent is not None]
|
backends.py → search.py
RENAMED
|
@@ -21,6 +21,9 @@ class APISearchResults(BaseModel):
|
|
| 21 |
|
| 22 |
class BraveSearchBlockedException(Exception):
|
| 23 |
"""Dummy exception to detect when the headless browser is flagged as suspicious."""
|
|
|
|
|
|
|
|
|
|
| 24 |
pass
|
| 25 |
|
| 26 |
|
|
@@ -36,7 +39,7 @@ async def playwright_open_page(browser: Browser):
|
|
| 36 |
await context.close()
|
| 37 |
|
| 38 |
|
| 39 |
-
#TODO: update to return same format for results
|
| 40 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 41 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 42 |
|
|
|
|
| 21 |
|
| 22 |
class BraveSearchBlockedException(Exception):
    """Raised when Brave Search flags the headless browser as suspicious.

    Carries a default message so a bare `raise BraveSearchBlockedException()`
    is self-describing; explicit arguments, when given, override the default.
    """

    def __init__(self, *args):
        # The original accepted *args but discarded them entirely; forward
        # them when provided so callers can attach context, while keeping
        # the no-argument behavior identical. (Also drops the stray `pass`.)
        if not args:
            args = ("Brave Search blocked the request, likely due to flagging browser as suspicious",)
        super().__init__(*args)
|
| 28 |
|
| 29 |
|
|
|
|
| 39 |
await context.close()
|
| 40 |
|
| 41 |
|
| 42 |
+
# TODO: update to return same format for results
|
| 43 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 44 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 45 |
|