Update helper_functions_api.py
helper_functions_api.py  CHANGED  (+5 -4)
@@ -149,10 +149,11 @@ class Scraper:
 
 def extract_main_content(html):
     if html:
+        plain_text = ""
         soup = BeautifulSoup(html, 'lxml')
-
-
-        return
+        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
+            plain_text += element.get_text(separator=" ", strip=True) + "\n"
+        return plain_text
     return ""
 
 def process_content(url, query):
@@ -161,7 +162,7 @@ def process_content(url, query):
     if html_content:
         content = extract_main_content(html_content)
         if content:
-            rephrased_content = rephrase_content(remove_stopwords(content)
+            rephrased_content = rephrase_content(limit_tokens(remove_stopwords(content)), query)
             return rephrased_content, url
     return "", url
 
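For reference, below is a standalone sketch of extract_main_content as it reads after this commit, exercised against a small invented HTML snippet. The sample markup and the print call are illustrative only and not part of the repository; bs4 and lxml are assumed to be installed, since the existing code already uses BeautifulSoup with the 'lxml' parser. The limit_tokens, remove_stopwords, and rephrase_content helpers touched in the second hunk are defined elsewhere in helper_functions_api.py and are not reproduced here.

# Sketch only: the post-commit extract_main_content, shown in isolation.
from bs4 import BeautifulSoup

def extract_main_content(html):
    if html:
        plain_text = ""
        soup = BeautifulSoup(html, 'lxml')
        # Walk headings, paragraphs, and tables in document order and
        # collapse each element's visible text onto its own line.
        for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'table']):
            plain_text += element.get_text(separator=" ", strip=True) + "\n"
        return plain_text
    return ""

# Illustrative input, not from the repository:
sample_html = "<h1>Title</h1><p>Some paragraph text.</p><table><tr><td>a</td><td>b</td></tr></table>"
print(extract_main_content(sample_html))
# -> "Title\nSome paragraph text.\na b\n"

The second hunk also closes the parenthesis that was missing from the old rephrase_content call and passes query through, presumably so the rephrasing can be conditioned on the search query after the stopword-stripped content has been capped by limit_tokens.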