web-server

Sleeping

App Files Community

pvanand committed on Jun 16, 2024

Commit

ec971eb

verified ·

1 Parent(s): 129b060

Update helper_functions_api.py

Browse files

Files changed (1) hide show

helper_functions_api.py +36 -19

helper_functions_api.py CHANGED Viewed

@@ -66,6 +66,7 @@ from fuzzy_json import loads
 from half_json.core import JSONFixer
 from openai import OpenAI
 from together import Together
 llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
 llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
@@ -195,27 +196,43 @@ class Scraper:
             print(f"Error fetching page content for {url}: {e}")
         return None
-def extract_main_content(html):
-    if html:
-        plain_text = ""
-        soup = BeautifulSoup(html, 'lxml')
-        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
-            plain_text += element.get_text(separator=" ", strip=True) + "\n"
-        return plain_text
-    return ""
 def process_content(data_format, url, query):
-    scraper = Scraper()
-    html_content = scraper.fetch_content(url)
-    if html_content:
-        content = extract_main_content(html_content)
-        if content:
-            rephrased_content = rephrase_content(
-                data_format=data_format,
-                content=limit_tokens(remove_stopwords(content), token_limit=1000),
-                query=query,
-            )
-            return rephrased_content, url
     return "", url
 def fetch_and_extract_content(data_format, urls, query):

 from half_json.core import JSONFixer
 from openai import OpenAI
 from together import Together
+from urllib.parse import urlparse
 llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
 llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
             print(f"Error fetching page content for {url}: {e}")
         return None
+def extract_main_content(url):
+    if url:
+        try:
+            result = urlparse(url)
+            if all([result.scheme, result.netloc]):
+                # Prepare query parameters
+                params = {
+                    "url": url,
+                    "favor_precision": False,
+                    "favor_recall": False,
+                    "output_format": "markdown",
+                    "target_language": "en",
+                    "include_tables": True,
+                    "include_images": False,
+                    "include_links": False,
+                    "deduplicate": True,
+                }
+                # Make request to FastAPI endpoint
+                response = requests.get("https://pvanand-web-scraping.hf.space/extract-article", params=params)
+                if response.status_code == 200:
+                    return response.json()["article"]
+                else:
+                    return ""
+        except:
+            return ""
 def process_content(data_format, url, query):
+    content = extract_main_content(url)
+    if content:
+        rephrased_content = rephrase_content(
+            data_format=data_format,
+            content=limit_tokens(remove_stopwords(content), token_limit=4000),
+            query=query,
+        )
+        return rephrased_content, url
     return "", url
 def fetch_and_extract_content(data_format, urls, query):