Update helper_functions_api.py
helper_functions_api.py  CHANGED  (+5 -4)
@@ -149,10 +149,11 @@ class Scraper:
 
 def extract_main_content(html):
     if html:
+        plain_text = ""
         soup = BeautifulSoup(html, 'lxml')
-
-
-        return
+        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
+            plain_text += element.get_text(separator=" ", strip=True) + "\n"
+        return plain_text
     return ""
 
 def process_content(url, query):
@@ -161,7 +162,7 @@ def process_content(url, query):
     if html_content:
         content = extract_main_content(html_content)
         if content:
-            rephrased_content = rephrase_content(remove_stopwords(content)
+            rephrased_content = rephrase_content(limit_tokens(remove_stopwords(content)), query)
             return rephrased_content, url
     return "", url
 
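For reference, below is a standalone sketch of extract_main_content as it reads after this commit, exercised against a small invented HTML snippet. The sample markup and the print call are illustrative only and not part of the repository; bs4 and lxml are assumed to be installed, since the existing code already uses BeautifulSoup with the 'lxml' parser. The limit_tokens, remove_stopwords, and rephrase_content helpers touched in the second hunk are defined elsewhere in helper_functions_api.py and are not reproduced here.

# Sketch only: the post-commit extract_main_content, shown in isolation.
from bs4 import BeautifulSoup

def extract_main_content(html):
    if html:
        plain_text = ""
        soup = BeautifulSoup(html, 'lxml')
        # Walk headings, paragraphs, and tables in document order and
        # collapse each element's visible text onto its own line.
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
            plain_text += element.get_text(separator=" ", strip=True) + "\n"
        return plain_text
    return ""

# Illustrative input, not from the repository:
sample_html = "<h1>Title</h1><p>Some paragraph text.</p><table><tr><td>a</td><td>b</td></tr></table>"
print(extract_main_content(sample_html))
# -> "Title\nSome paragraph text.\na b\n"

The second hunk also closes the parenthesis that was missing from the old rephrase_content call and passes query through, presumably so the rephrasing can be conditioned on the search query after the stopword-stripped content has been capped by limit_tokens.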