Spaces:

Hansimov
/

web-search-api

Paused

App Files Files Community

Hansimov committed on Jan 11, 2024

Commit

e92817a

1 Parent(s): 876e441

:gem: [Feature] New BatchWebpageFetcher: Fetch multiple urls concurrently

Browse files

Files changed (1) hide show

networks/webpage_fetcher.py +38 -6

networks/webpage_fetcher.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import requests
 import tldextract
 from pathlib import Path
@@ -53,11 +54,42 @@ class WebpageFetcher:
         return self.output_path
 if __name__ == "__main__":
-    url = (
-        # "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
-        # "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528"
-        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html"
     )
-    fetcher = WebpageFetcher()
-    fetcher.fetch(url)

+import concurrent.futures
 import requests
 import tldextract
 from pathlib import Path
         return self.output_path
+class BatchWebpageFetcher:
+    def __init__(self):
+        self.done_count = 0
+        self.total_count = 0
+    def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
+        webpage_fetcher = WebpageFetcher()
+        webpage_fetcher.fetch(url=url, overwrite=overwrite, output_parent=output_parent)
+        self.done_count += 1
+        logger.success(f"> {self.done_count}/{self.total_count}: {url}")
+    def fetch(self, urls, overwrite=False, output_parent=None):
+        self.urls = urls
+        self.total_count = len(self.urls)
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            futures = [
+                executor.submit(
+                    self.fecth_single_webpage,
+                    url=url,
+                    overwrite=overwrite,
+                    output_parent=output_parent,
+                )
+                for url in urls
+            ]
+            for idx, future in enumerate(concurrent.futures.as_completed(futures)):
+                result = future.result()
 if __name__ == "__main__":
+    urls = [
+        "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
+        "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
+        "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
+    ]
+    batch_webpage_fetcher = BatchWebpageFetcher()
+    batch_webpage_fetcher.fetch(
+        urls=urls, overwrite=True, output_parent="python tutorials"
     )