Spaces:
Paused
Paused
:boom: [Fix] SearchAPIApp: incorrect order of extracted contents to urls
Browse files
- apis/search_api.py +15 -13
apis/search_api.py
CHANGED
|
@@ -93,33 +93,35 @@ class SearchAPIApp:
|
|
| 93 |
overwrite=overwrite_webpage_html,
|
| 94 |
output_parent=query_search_results["query"],
|
| 95 |
)
|
|
|
|
|
|
|
| 96 |
html_paths = [
|
| 97 |
str(url_and_html_path["html_path"])
|
| 98 |
for url_and_html_path in url_and_html_path_list
|
| 99 |
]
|
| 100 |
-
|
| 101 |
-
# Extract webpage contents from htmls
|
| 102 |
batch_webpage_content_extractor = BatchWebpageContentExtractor()
|
| 103 |
html_path_and_extracted_content_list = (
|
| 104 |
batch_webpage_content_extractor.extract(html_paths)
|
| 105 |
)
|
| 106 |
|
| 107 |
-
#
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
]["extracted_content"]
|
| 116 |
-
|
|
|
|
| 117 |
|
|
|
|
| 118 |
for query_result_idx, query_result in enumerate(
|
| 119 |
query_search_results["query_results"]
|
| 120 |
):
|
| 121 |
url = query_result["url"]
|
| 122 |
-
extracted_content =
|
| 123 |
queries_search_results[query_idx]["query_results"][query_result_idx][
|
| 124 |
"text"
|
| 125 |
] = extracted_content
|
|
|
|
| 93 |
overwrite=overwrite_webpage_html,
|
| 94 |
output_parent=query_search_results["query"],
|
| 95 |
)
|
| 96 |
+
|
| 97 |
+
# Extract webpage contents from htmls
|
| 98 |
html_paths = [
|
| 99 |
str(url_and_html_path["html_path"])
|
| 100 |
for url_and_html_path in url_and_html_path_list
|
| 101 |
]
|
|
|
|
|
|
|
| 102 |
batch_webpage_content_extractor = BatchWebpageContentExtractor()
|
| 103 |
html_path_and_extracted_content_list = (
|
| 104 |
batch_webpage_content_extractor.extract(html_paths)
|
| 105 |
)
|
| 106 |
|
| 107 |
+
# Build the map of url to extracted_content
|
| 108 |
+
html_path_to_url_dict = {
|
| 109 |
+
str(url_and_html_path["html_path"]): url_and_html_path["url"]
|
| 110 |
+
for url_and_html_path in url_and_html_path_list
|
| 111 |
+
}
|
| 112 |
+
url_to_extracted_content_dict = {
|
| 113 |
+
html_path_to_url_dict[
|
| 114 |
+
html_path_and_extracted_content["html_path"]
|
| 115 |
+
]: html_path_and_extracted_content["extracted_content"]
|
| 116 |
+
for html_path_and_extracted_content in html_path_and_extracted_content_list
|
| 117 |
+
}
|
| 118 |
|
| 119 |
+
# Write extracted contents (as 'text' field) to query_search_results
|
| 120 |
for query_result_idx, query_result in enumerate(
|
| 121 |
query_search_results["query_results"]
|
| 122 |
):
|
| 123 |
url = query_result["url"]
|
| 124 |
+
extracted_content = url_to_extracted_content_dict[url]
|
| 125 |
queries_search_results[query_idx]["query_results"][query_result_idx][
|
| 126 |
"text"
|
| 127 |
] = extracted_content
|