Spaces:

OrganizedProgrammers
/

SERPent

Sleeping

App Files Community

Game4all committed on Jun 21

Commit

8d6fbc5

1 Parent(s): cf1c265

Reorganize endpoints

Browse files

Files changed (3) hide show

README.md +3 -1
app.py +11 -6
search.py +22 -7

README.md CHANGED Viewed

@@ -9,4 +9,6 @@ short_description: A SERP scrapping API for AI projects
 ---
-# `SERPent`

 ---
+# `SERPent`
+`SERPent` provides a SERP / scraping API for use by AI agents / projects.

app.py CHANGED Viewed

@@ -42,6 +42,9 @@ backend_status = expiringdict.ExpiringDict(max_len=5, max_age_seconds=15*60)
 # Router for scrapping related endpoints
 scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
 @app.get('/')
 async def status():
@@ -62,13 +65,13 @@ class APISearchParams(BaseModel):
         10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
-@app.post("/search_scholar")
 async def query_google_scholar(params: APISearchParams):
     """Queries google scholar for the specified query"""
     return {"error": "Unimplemented"}
-@app.post("/search_patents")
 async def search_patents(params: APISearchParams) -> APIPatentResults:
     """Searches google patents for the specified queries and returns the found documents."""
     results = []
@@ -84,7 +87,7 @@ async def search_patents(params: APISearchParams) -> APIPatentResults:
     return APIPatentResults(results=results, error=None)
-@app.post("/search_brave")
 async def search_brave(params: APISearchParams) -> APISearchResults:
     """Searches brave search for the specified queries and returns the found documents."""
     results = []
@@ -103,7 +106,7 @@ async def search_brave(params: APISearchParams) -> APISearchResults:
     return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
-@app.post("/search_bing")
 async def search_bing(params: APISearchParams) -> APISearchResults:
     """Searches Bing search for the specified queries and returns the found documents."""
     results = []
@@ -111,7 +114,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
     for q in params.queries:
         logging.info(f"Searching Bing search with query `{q}`")
         try:
-            res = await query_brave_search(pw_browser, q, params.n_results)
             results.extend(res)
         except Exception as e:
             last_exception = e
@@ -122,7 +125,7 @@ async def search_bing(params: APISearchParams) -> APISearchResults:
     return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
-@app.post("/search_duck")
 async def search_duck(params: APISearchParams) -> APISearchResults:
     """Searches duckduckgo for the specified queries and returns the found documents"""
     results = []
@@ -141,6 +144,7 @@ async def search_duck(params: APISearchParams) -> APISearchResults:
     return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
 @app.post("/search")
 async def search(params: APISearchParams):
     """Attempts to search the specified queries using ALL backends"""
@@ -205,6 +209,7 @@ async def scrap_patents(params: ScrapPatentsRequest):
 # ===============================================================================
 app.include_router(scrap_router)
 uvicorn.run(app, host="0.0.0.0", port=7860)

 # Router for scrapping related endpoints
 scrap_router = APIRouter(prefix="/scrap", tags=["scrapping"])
+# Router for search related endpoints
+search_router = APIRouter(prefix="/search", tags=["search"])
 @app.get('/')
 async def status():
         10, description="Number of results to return for each query. Valid values are 10, 25, 50 and 100")
+@search_router.post("/search_scholar")
 async def query_google_scholar(params: APISearchParams):
     """Queries google scholar for the specified query"""
     return {"error": "Unimplemented"}
+@search_router.post("/search_patents")
 async def search_patents(params: APISearchParams) -> APIPatentResults:
     """Searches google patents for the specified queries and returns the found documents."""
     results = []
     return APIPatentResults(results=results, error=None)
+@search_router.post("/search_brave")
 async def search_brave(params: APISearchParams) -> APISearchResults:
     """Searches brave search for the specified queries and returns the found documents."""
     results = []
     return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+@search_router.post("/search_bing")
 async def search_bing(params: APISearchParams) -> APISearchResults:
     """Searches Bing search for the specified queries and returns the found documents."""
     results = []
     for q in params.queries:
         logging.info(f"Searching Bing search with query `{q}`")
         try:
+            res = await query_bing_search(pw_browser, q, params.n_results)
             results.extend(res)
         except Exception as e:
             last_exception = e
     return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+@search_router.post("/search_duck")
 async def search_duck(params: APISearchParams) -> APISearchResults:
     """Searches duckduckgo for the specified queries and returns the found documents"""
     results = []
     return APISearchResults(results=results, error=str(last_exception) if len(results) == 0 and last_exception else None)
+@search_router.post("/search")
 @app.post("/search")
 async def search(params: APISearchParams):
     """Attempts to search the specified queries using ALL backends"""
 # ===============================================================================
+app.include_router(search_router)
 app.include_router(scrap_router)
 uvicorn.run(app, host="0.0.0.0", port=7860)

search.py CHANGED Viewed

@@ -39,7 +39,6 @@ async def playwright_open_page(browser: Browser):
         await context.close()
-# TODO: update to return same format for results
 async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
     """Queries google patents for the specified query and number of results. Returns relevant patents"""
@@ -65,17 +64,33 @@ async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
         PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
         items = await page.locator("search-result-item").all()
-        id_matches = []
         for item in items:
             all_text = " ".join(await item.locator("span").all_inner_texts())
             found = re.findall(PATENT_ID_REGEX, all_text)
-            if found:
-                id_matches.append(found[0])
-        patents = [{"href": f"https://patents.google.com/patent/{id}/en", "id": id}
-                   for id in id_matches]
-    return patents[:n_results]
 async def query_brave_search(browser: Browser, q: str, n_results: int = 10):

         await context.close()
 async def query_google_patents(browser: Browser, q: str, n_results: int = 10):
     """Queries google patents for the specified query and number of results. Returns relevant patents"""
         PATENT_ID_REGEX = r"\b[A-Z]{2}\d{6,}(?:[A-Z]\d?)?\b"
         items = await page.locator("search-result-item").all()
+        results = []
         for item in items:
+            # Extract all inner texts from spans (still used for patent ID)
             all_text = " ".join(await item.locator("span").all_inner_texts())
             found = re.findall(PATENT_ID_REGEX, all_text)
+            if not found:
+                continue
+            # get the first match as patent ID
+            patent_id = found[0]
+            # extract patent title
+            title = await item.locator("h3, h4").first.inner_text(timeout=1000)
+            # extract patent body
+            snippet_locator = item.locator(
+                "div.abstract, div.result-snippet, .snippet, .result-text")
+            body = await snippet_locator.first.inner_text(timeout=1000)
+            results.append({
+                "id": patent_id,
+                "href": f"https://patents.google.com/patent/{patent_id}/en",
+                "title": title,
+                "body": body
+            })
+    return results[:n_results]
 async def query_brave_search(browser: Browser, q: str, n_results: int = 10):