Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -92,7 +92,7 @@ def extract_text_from_pptx(pptx_data, clean=True):
|
|
| 92 |
text = clean_text(text)
|
| 93 |
return text, len(text)
|
| 94 |
|
| 95 |
-
def read_document(file_path, clean=True):
|
| 96 |
with open(file_path, "rb") as f:
|
| 97 |
file_content = f.read()
|
| 98 |
|
|
@@ -159,8 +159,8 @@ def read_document(file_path, clean=True):
|
|
| 159 |
soup = BeautifulSoup(file_content, 'html.parser')
|
| 160 |
structured_data = {
|
| 161 |
"Texts": extract_texts(soup),
|
| 162 |
-
"Links": extract_links(soup,
|
| 163 |
-
"Images": extract_images(soup,
|
| 164 |
}
|
| 165 |
return format_detailed_output(structured_data), 0
|
| 166 |
except Exception as e:
|
|
@@ -204,7 +204,7 @@ def download_and_process_file(url, clean=True):
|
|
| 204 |
if kind and kind.mime.startswith('image/'):
|
| 205 |
return f"", 0 # Return markdown image syntax if it's an image
|
| 206 |
else:
|
| 207 |
-
return read_document(temp_filename, clean) # Otherwise, process as a document
|
| 208 |
|
| 209 |
except requests.exceptions.MissingSchema:
|
| 210 |
return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
|
|
|
|
| 92 |
text = clean_text(text)
|
| 93 |
return text, len(text)
|
| 94 |
|
| 95 |
+
def read_document(file_path, clean=True, url=""):
|
| 96 |
with open(file_path, "rb") as f:
|
| 97 |
file_content = f.read()
|
| 98 |
|
|
|
|
| 159 |
soup = BeautifulSoup(file_content, 'html.parser')
|
| 160 |
structured_data = {
|
| 161 |
"Texts": extract_texts(soup),
|
| 162 |
+
"Links": extract_links(soup, url),
|
| 163 |
+
"Images": extract_images(soup, url)
|
| 164 |
}
|
| 165 |
return format_detailed_output(structured_data), 0
|
| 166 |
except Exception as e:
|
|
|
|
| 204 |
if kind and kind.mime.startswith('image/'):
|
| 205 |
return f"", 0 # Return markdown image syntax if it's an image
|
| 206 |
else:
|
| 207 |
+
return read_document(temp_filename, clean, url) # Otherwise, process as a document
|
| 208 |
|
| 209 |
except requests.exceptions.MissingSchema:
|
| 210 |
return "Error: Invalid URL format. Even after adding 'http://', the URL is still invalid.", 0
|