Spaces:
Paused
Paused
:boom: [Fix] WebpageContentExtractor: UnicodeDecodeError
Browse files
documents/webpage_content_extractor.py
CHANGED
|
@@ -81,8 +81,17 @@ class WebpageContentExtractor:
|
|
| 81 |
logger.warn(f"File not found: {html_path}")
|
| 82 |
return ""
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
html_str = self.remove_elements_from_html(html_str)
|
| 88 |
markdown_str = self.html_to_markdown(html_str)
|
|
|
|
| 81 |
logger.warn(f"File not found: {html_path}")
|
| 82 |
return ""
|
| 83 |
|
| 84 |
+
encodings = ["utf-8", "latin-1"]
|
| 85 |
+
for encoding in encodings:
|
| 86 |
+
try:
|
| 87 |
+
with open(html_path, "r", encoding=encoding, errors="ignore") as rf:
|
| 88 |
+
html_str = rf.read()
|
| 89 |
+
break
|
| 90 |
+
except UnicodeDecodeError:
|
| 91 |
+
pass
|
| 92 |
+
else:
|
| 93 |
+
logger.warn(f"No matching encodings: {html_path}")
|
| 94 |
+
return ""
|
| 95 |
|
| 96 |
html_str = self.remove_elements_from_html(html_str)
|
| 97 |
markdown_str = self.html_to_markdown(html_str)
|