Update app.py

app.py CHANGED
@@ -34,12 +34,45 @@ def clean_text(content):
     content = re.sub(r'\s+', ' ', content)
     return content
 
-def
-    """
-
-
-
-
+def extract_texts(soup):
+    """Extracts all text content from the soup."""
+    return [text for text in soup.stripped_strings]
+
+def extract_links(soup, base_url):
+    """Extracts all valid links from the soup."""
+    links = []
+    for link in soup.find_all('a', href=True):
+        href = link['href']
+        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
+        link_text = link.get_text(strip=True) or "No Text"
+        links.append({"Text": link_text, "URL": full_url})
+    return links
+
+def extract_images(soup, base_url):
+    """Extracts all valid image URLs and their alt text from the soup."""
+    images = []
+    for img in soup.find_all('img', src=True):
+        img_url = img['src']
+        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
+        alt_text = img.get('alt', 'No Alt Text')
+        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
+    return images
+
+def format_detailed_output(structured_data):
+    """Formats the structured data into a Markdown string."""
+    result = "### Structured Page Content\n\n"
+    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
+    result += "**Links:**\n"
+    if structured_data["Links"]:
+        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
+    else:
+        result += "No links found.\n"
+    result += "**Images:**\n"
+    if structured_data["Images"]:
+        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
+    else:
+        result += "No images found.\n"
+    return result
 
 # --- Document Reading Functions ---
 
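For orientation, here is a minimal standalone sketch of how the helpers added in this hunk fit together. It assumes `from bs4 import BeautifulSoup` and `from urllib.parse import urljoin` are already present at the top of app.py (the import block is outside this diff); the sample HTML and base URL are purely illustrative.

    from bs4 import BeautifulSoup

    sample_html = '<p>Hello</p><a href="/docs">Docs</a><img src="logo.png" alt="Logo">'
    soup = BeautifulSoup(sample_html, 'html.parser')  # stdlib parser, no extra dependency

    structured_data = {
        "Texts": extract_texts(soup),
        "Links": extract_links(soup, "https://example.com"),    # base URL is illustrative
        "Images": extract_images(soup, "https://example.com"),
    }
    print(format_detailed_output(structured_data))
    # Relative href/src values are resolved against the base URL,
    # e.g. /docs -> https://example.com/docs and logo.png -> https://example.com/logo.png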
@@ -156,7 +189,7 @@ def read_document(file_path, clean=True, url=""):
             return f"Error reading PPTX: {e}", 0
     elif mime == "text/html": # Handle HTML content
         try:
-            soup = BeautifulSoup(file_content, '
+            soup = BeautifulSoup(file_content, 'lxml')
             structured_data = {
                 "Texts": extract_texts(soup),
                 "Links": extract_links(soup, url),
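A note on the parser argument introduced here: BeautifulSoup only honors 'lxml' when the third-party lxml package is installed in the environment; otherwise it raises bs4.FeatureNotFound at call time. Below is a small fallback sketch, not part of this commit and making no assumption about the Space's requirements.txt; the make_soup name is illustrative.

    from bs4 import BeautifulSoup, FeatureNotFound

    def make_soup(markup):
        """Use lxml when available, otherwise fall back to the stdlib parser."""
        try:
            return BeautifulSoup(markup, 'lxml')
        except FeatureNotFound:
            return BeautifulSoup(markup, 'html.parser')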
@@ -181,15 +214,10 @@ def download_and_process_file(url, clean=True):
 
     try:
         response = requests.get(url, stream=True, timeout=10)
-        response.raise_for_status() # Raise an exception for bad status codes
-
-        # Generate a safe and unique temporary filename
         original_filename = os.path.basename(url)
-        # Remove invalid characters from filename
         safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
         temp_filename = f"{safe_filename}"
 
-        # Infer file extension from content type
         content_type = response.headers['content-type']
         ext = mimetypes.guess_extension(content_type)
         if ext and not temp_filename.endswith(ext): # Append extension if not already present
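For reference, the filename handling kept in this hunk can be traced on its own. The example URL and content type below are made up, and note that mimetypes.guess_extension expects a bare MIME type (a header value such as 'text/html; charset=utf-8' would need its parameters stripped first).

    import mimetypes, os, re

    url = "https://example.com/files/report 2024.pdf"   # illustrative
    content_type = "application/pdf"                     # e.g. response.headers['content-type']

    original_filename = os.path.basename(url)            # 'report 2024.pdf'
    safe_filename = re.sub(r'[^\w\-_\. ]', '_', original_filename)
    temp_filename = f"{safe_filename}"

    ext = mimetypes.guess_extension(content_type)        # '.pdf'
    if ext and not temp_filename.endswith(ext):
        temp_filename += ext                             # already ends with '.pdf', so unchanged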
@@ -199,7 +227,6 @@ def download_and_process_file(url, clean=True):
             for chunk in response.iter_content(chunk_size=8192000):
                 f.write(chunk)
 
-        # Check if it's an image type
         kind = filetype.guess(temp_filename)
         if kind and kind.mime.startswith('image/'):
             return f"", 0 # Return markdown image syntax if it's an image
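The image check in this hunk relies on the filetype package's content sniffing rather than the file extension; here is a minimal sketch of the same call in isolation (the path is illustrative).

    import filetype

    def is_image_file(path):
        """True when the file's magic bytes indicate an image type."""
        kind = filetype.guess(path)   # returns None when the type is not recognized
        return bool(kind and kind.mime.startswith('image/'))

    # e.g. is_image_file('logo.png') is True for a real PNG and False for a text file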
@@ -215,72 +242,6 @@ def download_and_process_file(url, clean=True):
     except requests.exceptions.RequestException as e:
         return f"Error downloading file: {e}", 0
 
-# --- Web Page Content Extraction Functions (from previous code) ---
-
-def extract_texts(soup):
-    """Extracts all text content from the soup."""
-    return [text for text in soup.stripped_strings]
-
-def extract_links(soup, base_url):
-    """Extracts all valid links from the soup."""
-    links = []
-    for link in soup.find_all('a', href=True):
-        href = link['href']
-        # Use urljoin to create an absolute URL
-        full_url = urljoin(base_url, href) if not href.startswith(("http://", "https://")) else href
-        link_text = link.get_text(strip=True) or "No Text"
-        links.append({"Text": link_text, "URL": full_url})
-    return links
-
-def extract_images(soup, base_url):
-    """Extracts all valid image URLs and their alt text from the soup."""
-    images = []
-    for img in soup.find_all('img', src=True):
-        img_url = img['src']
-        # Use urljoin to create an absolute URL
-        full_img_url = urljoin(base_url, img_url) if not img_url.startswith(("http://", "https://")) else img_url
-        alt_text = img.get('alt', 'No Alt Text')
-        images.append({"Alt Text": alt_text, "Image URL": full_img_url})
-    return images
-
-def fetch_page_content(url):
-    """Fetches the content of the page at the given URL."""
-    try:
-        response = requests.get(url, timeout=10)
-        response.raise_for_status()
-        return response.text
-    except requests.exceptions.RequestException as e:
-        return f"Error fetching the URL: {e}"
-
-def format_detailed_output(structured_data):
-    """Formats the structured data into a Markdown string."""
-    result = "### Structured Page Content\n\n"
-    result += "**Texts:**\n" + (" ".join(structured_data["Texts"]) if structured_data["Texts"] else "No textual content found.") + "\n\n"
-    result += "**Links:**\n"
-    if structured_data["Links"]:
-        result += "\n".join(f"[{link['Text']}]({link['URL']})" for link in structured_data["Links"]) + "\n"
-    else:
-        result += "No links found.\n"
-    result += "**Images:**\n"
-    if structured_data["Images"]:
-        result += "\n".join(f"![{img['Alt Text']}]({img['Image URL']})" for img in structured_data["Images"]) + "\n"
-    else:
-        result += "No images found.\n"
-    return result
-
-def extract_page_content(url):
-    """Extracts and formats the content of the page at the given URL."""
-    page_content = fetch_page_content(url)
-    if "Error" in page_content:
-        return page_content
-    soup = BeautifulSoup(page_content, 'html.parser')
-    structured_data = {
-        "Texts": extract_texts(soup),
-        "Links": extract_links(soup, url), # Pass the base URL
-        "Images": extract_images(soup, url) # Pass the base URL
-    }
-    return format_detailed_output(structured_data)
-
 # --- Gradio Interface ---
 
 iface = gr.Interface(