:zap: [Enhance] SearchAPIApp: overwrite param for query and webpage HTML

apis/search_api.py (+18 -3)

@@ -47,6 +47,14 @@ class SearchAPIApp:
             default=False,
             description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
         )
+        overwrite_query_html: bool = Field(
+            default=False,
+            description="(bool) Overwrite HTML file of query results",
+        )
+        overwrite_webpage_html: bool = Field(
+            default=False,
+            description="(bool) Overwrite HTML files of webpages from query results",
+        )
 
     def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
         google_searcher = GoogleSearcher()
@@ -56,7 +64,10 @@
             if not query.strip():
                 continue
             query_html_path = google_searcher.search(
-                query=query,
+                query=query,
+                result_num=item.result_num,
+                safe=item.safe,
+                overwrite=item.overwrite_query_html,
             )
             query_search_results = query_results_extractor.extract(query_html_path)
             queries_search_results.append(query_search_results)
@@ -69,8 +80,12 @@
                 for query_result_idx, query_result in enumerate(
                     query_search_result["query_results"]
                 ):
-
-
+                    webpage_html_path = html_fetcher.fetch(
+                        query_result["url"], overwrite=item.overwrite_webpage_html
+                    )
+                    extracted_content = webpage_content_extractor.extract(
+                        webpage_html_path
+                    )
                     queries_search_results[query_idx]["query_results"][
                         query_result_idx
                     ]["text"] = extracted_content
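
The new `overwrite_query_html` and `overwrite_webpage_html` flags let a caller force re-downloading of the saved query-results page and of each result webpage, respectively. Below is a minimal client sketch; the route path, host, port, and the `queries` field name are illustrative assumptions not shown in this diff, while `result_num`, `safe`, and the two overwrite flags come from the code above.

    # Hypothetical request against a locally running instance; the route name
    # "/queries_to_search_results" and the host/port are assumptions for illustration.
    import requests

    payload = {
        "queries": ["site reliability engineering"],  # assumed field name for the query list
        "result_num": 10,
        "safe": False,
        "overwrite_query_html": True,    # new: re-download the query results HTML
        "overwrite_webpage_html": True,  # new: re-download each result webpage's HTML
    }

    response = requests.post(
        "http://127.0.0.1:21111/queries_to_search_results",  # assumed host/port/route
        json=payload,
    )
    print(response.json())

Both flags default to False, so existing callers keep their current behavior unless they explicitly opt in to re-fetching.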