Spaces:
Sleeping
Sleeping
Reorganize endpoints
Browse files
README.md
CHANGED
|
@@ -9,4 +9,6 @@ short_description: A SERP scrapping API for AI projects
|
|
| 9 |
---
|
| 10 |
|
| 11 |
|
| 12 |
-
# `SERPent`
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
|
| 12 |
+
# `SERPent`
|
| 13 |
+
|
| 14 |
+
`SERPent` provides a SERP / scraping API for use by AI agents and projects.
|
app.py
CHANGED
|
@@ -42,6 +42,9 @@ backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
|
|
| 42 |
|
| 43 |
# Router for scraping-related endpoints
|
| 44 |
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
@app.get('/')
|
| 47 |
async def status():
|
|
@@ -62,13 +65,13 @@ class APISearchParams(BaseModel):
|
|
| 62 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
| 63 |
|
| 64 |
|
| 65 |
-
@
|
| 66 |
async def query_google_scholar(params: APISearchParams):
|
| 67 |
"""Queries google scholar for the specified query"""
|
| 68 |
return {"error": "Unimplemented"}
|
| 69 |
|
| 70 |
|
| 71 |
-
@
|
| 72 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 73 |
"""Searches google patents for the specified queries and returns the found documents."""
|
| 74 |
results = []
|
|
@@ -84,7 +87,7 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
|
|
| 84 |
return APIPatentResults(results=results, error=None)
|
| 85 |
|
| 86 |
|
| 87 |
-
@
|
| 88 |
async def search_brave(params: APISearchParams) -> APISearchResults:
|
| 89 |
"""Searches brave search for the specified queries and returns the found documents."""
|
| 90 |
results = []
|
|
@@ -103,7 +106,7 @@ async def search_brave(params: APISearchParams) -> APISearchResults:
|
|
| 103 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 104 |
|
| 105 |
|
| 106 |
-
@
|
| 107 |
async def search_bing(params: APISearchParams) -> APISearchResults:
|
| 108 |
"""Searches Bing search for the specified queries and returns the found documents."""
|
| 109 |
results = []
|
|
@@ -111,7 +114,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
|
|
| 111 |
for q in params.queries:
|
| 112 |
logging.info(f"Searching Bing search with query `{q}`")
|
| 113 |
try:
|
| 114 |
-
res = await
|
| 115 |
results.extend(res)
|
| 116 |
except Exception as e:
|
| 117 |
last_exception = e
|
|
@@ -122,7 +125,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
|
|
| 122 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 123 |
|
| 124 |
|
| 125 |
-
@
|
| 126 |
async def search_duck(params: APISearchParams) -> APISearchResults:
|
| 127 |
"""Searches duckduckgo for the specified queries and returns the found documents"""
|
| 128 |
results = []
|
|
@@ -141,6 +144,7 @@ async def search_duck(params: APISearchParams) -> APISearchResults:
|
|
| 141 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 142 |
|
| 143 |
|
|
|
|
| 144 |
@app.post("/search")
|
| 145 |
async def search(params: APISearchParams):
|
| 146 |
"""Attempts to search the specified queries using ALL backends"""
|
|
@@ -205,6 +209,7 @@ async def scrap_patents(params: ScrapPatentsRequest):
|
|
| 205 |
|
| 206 |
# ===============================================================================
|
| 207 |
|
|
|
|
| 208 |
app.include_router(scrap_router)
|
| 209 |
|
| 210 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
|
|
|
| 42 |
|
| 43 |
# Router for scraping-related endpoints
|
| 44 |
scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
|
| 45 |
+
# Router for search related endpoints
|
| 46 |
+
search_router = APIRouter(prefix="/search", tags=["search"])
|
| 47 |
+
|
| 48 |
|
| 49 |
@app.get('/')
|
| 50 |
async def status():
|
|
|
|
| 65 |
10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
|
| 66 |
|
| 67 |
|
| 68 |
+
@search_router.post("/search_scholar")
|
| 69 |
async def query_google_scholar(params: APISearchParams):
|
| 70 |
"""Queries google scholar for the specified query"""
|
| 71 |
return {"error": "Unimplemented"}
|
| 72 |
|
| 73 |
|
| 74 |
+
@search_router.post("/search_patents")
|
| 75 |
async def search_patents(params: APISearchParams) -> APIPatentResults:
|
| 76 |
"""Searches google patents for the specified queries and returns the found documents."""
|
| 77 |
results = []
|
|
|
|
| 87 |
return APIPatentResults(results=results, error=None)
|
| 88 |
|
| 89 |
|
| 90 |
+
@search_router.post("/search_brave")
|
| 91 |
async def search_brave(params: APISearchParams) -> APISearchResults:
|
| 92 |
"""Searches brave search for the specified queries and returns the found documents."""
|
| 93 |
results = []
|
|
|
|
| 106 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 107 |
|
| 108 |
|
| 109 |
+
@search_router.post("/search_bing")
|
| 110 |
async def search_bing(params: APISearchParams) -> APISearchResults:
|
| 111 |
"""Searches Bing search for the specified queries and returns the found documents."""
|
| 112 |
results = []
|
|
|
|
| 114 |
for q in params.queries:
|
| 115 |
logging.info(f"Searching Bing search with query `{q}`")
|
| 116 |
try:
|
| 117 |
+
res = await query_bing_search(pw_browser, q, params.n_results)
|
| 118 |
results.extend(res)
|
| 119 |
except Exception as e:
|
| 120 |
last_exception = e
|
|
|
|
| 125 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 126 |
|
| 127 |
|
| 128 |
+
@search_router.post("/search_duck")
|
| 129 |
async def search_duck(params: APISearchParams) -> APISearchResults:
|
| 130 |
"""Searches duckduckgo for the specified queries and returns the found documents"""
|
| 131 |
results = []
|
|
|
|
| 144 |
return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
|
| 145 |
|
| 146 |
|
| 147 |
+
@search_router.post("/search")
|
| 148 |
@app.post("/search")
|
| 149 |
async def search(params: APISearchParams):
|
| 150 |
"""Attempts to search the specified queries using ALL backends"""
|
|
|
|
| 209 |
|
| 210 |
# ===============================================================================
|
| 211 |
|
| 212 |
+
app.include_router(search_router)
|
| 213 |
app.include_router(scrap_router)
|
| 214 |
|
| 215 |
uvicorn.run(app, host="0.0.0.0", port=7860)
|
search.py
CHANGED
|
@@ -39,7 +39,6 @@ async def playwright_open_page(browser: Browser):
|
|
| 39 |
await context.close()
|
| 40 |
|
| 41 |
|
| 42 |
-
# TODO: update to return same format for results
|
| 43 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 44 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 45 |
|
|
@@ -65,17 +64,33 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
|
| 65 |
PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
|
| 66 |
|
| 67 |
items = await page.locator("search-result-item").all()
|
| 68 |
-
|
| 69 |
for item in items:
|
|
|
|
| 70 |
all_text = " ".join(await item.locator("span").all_inner_texts())
|
| 71 |
found = re.findall(PATENT_ID_REGEX, all_text)
|
| 72 |
-
if found:
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
return
|
| 79 |
|
| 80 |
|
| 81 |
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
|
|
|
|
| 39 |
await context.close()
|
| 40 |
|
| 41 |
|
|
|
|
| 42 |
async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
|
| 43 |
"""Queries google patents for the specified query and number of results. Returns relevant patents"""
|
| 44 |
|
|
|
|
| 64 |
PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
|
| 65 |
|
| 66 |
items = await page.locator("search-result-item").all()
|
| 67 |
+
results = []
|
| 68 |
for item in items:
|
| 69 |
+
# Extract all inner texts from spans (still used for patent ID)
|
| 70 |
all_text = " ".join(await item.locator("span").all_inner_texts())
|
| 71 |
found = re.findall(PATENT_ID_REGEX, all_text)
|
| 72 |
+
if not found:
|
| 73 |
+
continue
|
| 74 |
+
|
| 75 |
+
# get the first match as patent ID
|
| 76 |
+
patent_id = found[0]
|
| 77 |
+
|
| 78 |
+
# extract patent title
|
| 79 |
+
title = await item.locator("h3, h4").first.inner_text(timeout=1000)
|
| 80 |
+
|
| 81 |
+
# extract patent body
|
| 82 |
+
snippet_locator = item.locator(
|
| 83 |
+
"div.abstract, div.result-snippet, .snippet, .result-text")
|
| 84 |
+
body = await snippet_locator.first.inner_text(timeout=1000)
|
| 85 |
|
| 86 |
+
results.append({
|
| 87 |
+
"id": patent_id,
|
| 88 |
+
"href": f"https://patents.google.com/patent/{patent_id}/en",
|
| 89 |
+
"title": title,
|
| 90 |
+
"body": body
|
| 91 |
+
})
|
| 92 |
|
| 93 |
+
return results[:n_results]
|
| 94 |
|
| 95 |
|
| 96 |
async def query_brave_search(browser: Browser, q: str, n_results: int = 10):
|