Spaces: Sleeping
Update helper_functions_api.py
Browse files
- helper_functions_api.py +16 -8
helper_functions_api.py
CHANGED
|
@@ -67,6 +67,7 @@ from half_json.core import JSONFixer
|
|
| 67 |
from openai import OpenAI
|
| 68 |
from together import Together
|
| 69 |
from urllib.parse import urlparse
|
|
|
|
| 70 |
|
| 71 |
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
|
| 72 |
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
|
|
@@ -197,13 +198,20 @@ class Scraper:
|
|
| 197 |
return None
|
| 198 |
|
| 199 |
def extract_main_content(html):
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
def process_content(data_format, url, query):
|
| 209 |
scraper = Scraper()
|
|
@@ -213,7 +221,7 @@ def process_content(data_format, url, query):
|
|
| 213 |
if content:
|
| 214 |
rephrased_content = rephrase_content(
|
| 215 |
data_format=data_format,
|
| 216 |
-
content=limit_tokens(remove_stopwords(content), token_limit=
|
| 217 |
query=query,
|
| 218 |
)
|
| 219 |
return rephrased_content, url
|
|
|
|
| 67 |
from openai import OpenAI
|
| 68 |
from together import Together
|
| 69 |
from urllib.parse import urlparse
|
| 70 |
+
import trafilatura
|
| 71 |
|
| 72 |
llm_default_small = "meta-llama/Llama-3-8b-chat-hf"
|
| 73 |
llm_default_medium = "meta-llama/Llama-3-70b-chat-hf"
|
|
|
|
| 198 |
return None
|
| 199 |
|
| 200 |
def extract_main_content(html):
|
| 201 |
+
extracted = trafilatura.extract(
|
| 202 |
+
html,
|
| 203 |
+
output_format="markdown",
|
| 204 |
+
target_language="en",
|
| 205 |
+
include_tables=True,
|
| 206 |
+
include_images=False,
|
| 207 |
+
include_links=False,
|
| 208 |
+
deduplicate=True,
|
| 209 |
+
)
|
| 210 |
+
|
| 211 |
+
if extracted:
|
| 212 |
+
return trafilatura.utils.sanitize(extracted)
|
| 213 |
+
else:
|
| 214 |
+
return ""
|
| 215 |
|
| 216 |
def process_content(data_format, url, query):
|
| 217 |
scraper = Scraper()
|
|
|
|
| 221 |
if content:
|
| 222 |
rephrased_content = rephrase_content(
|
| 223 |
data_format=data_format,
|
| 224 |
+
content=limit_tokens(remove_stopwords(content), token_limit=4000),
|
| 225 |
query=query,
|
| 226 |
)
|
| 227 |
return rephrased_content, url
|