Spaces:

intelli-zen
/

document_loaders

Paused

App Files Files Community

document_loaders / toolbox /to_markdown /html_to_markdown.py

HoneyTian

first commit

e94100d 11 months ago

raw

history blame contribute delete

8.29 kB

	#!/usr/bin/python3
	# -*- coding: utf-8 -*-
	import argparse
	import os
	from pathlib import Path
	import re
	import shutil
	import tempfile
	import uuid

	from markdownify import markdownify as md
	from selenium import webdriver
	from selenium.webdriver.chrome.service import Service
	from webdriver_manager.chrome import ChromeDriverManager, DriverCacheManager
	from bs4 import BeautifulSoup

	from project_settings import project_path
	from toolbox.to_markdown.base_to_markdown import BaseToMarkdown


	def get_args():
	    """Parse the command-line options of this script.

	    Returns:
	        The ``argparse.Namespace`` produced by the parser. Its only option is
	        ``--filename`` (a string), which defaults to the ``nxcloud.html``
	        sample located under ``project_path / "data/files/html"``.
	    """
	    # Alternative sample input kept for reference:
	    # (project_path / "data/files/html/nxlink.html").as_posix()
	    default_filename = (project_path / "data/files/html/nxcloud.html").as_posix()

	    arg_parser = argparse.ArgumentParser()
	    arg_parser.add_argument("--filename", default=default_filename, type=str)
	    return arg_parser.parse_args()


	class HtmlPreprocess:
	    """Regex-based helpers that strip or unwrap pieces of an HTML string.

	    Every public helper is a static method that takes the HTML text and
	    returns a new string. None of them parse the document; they are plain
	    regular-expression substitutions applied to the raw text.
	    """

	    @staticmethod
	    def _remove_attribute(html_doc: str, name: str) -> str:
	        """Delete every ``name="..."`` / ``name='...'`` attribute.

	        The value is matched with a negated character class rather than a
	        lazy ``.+?`` so that an empty value (``class=""``) cannot make the
	        match run on into the following attribute.
	        """
	        pattern = rf"""\s+{re.escape(name)}=(?:"[^"]*"|'[^']*')"""
	        return re.sub(pattern, "", html_doc)

	    @staticmethod
	    def _unwrap_tag(html_doc: str, tag: str) -> str:
	        """Drop the opening and closing ``tag`` markers, keeping the content.

	        Opening and closing tags are removed independently, so nested
	        elements of the same name are unwrapped completely. ``\\b`` keeps
	        ``a`` from matching longer names such as ``abbr``.
	        """
	        pattern = rf"</?{re.escape(tag)}\b[^>]*>"
	        return re.sub(pattern, "", html_doc)

	    @staticmethod
	    def remove_comment(html_doc: str) -> str:
	        """Remove ``<!-- ... -->`` comments, including multi-line ones."""
	        return re.sub(r"<!--.*?-->", "", html_doc, flags=re.DOTALL)

	    @staticmethod
	    def remove_img(html_doc: str) -> str:
	        """Remove ``<img ...>`` tags."""
	        return re.sub(r"<img.*?>", "", html_doc, flags=re.DOTALL)

	    @staticmethod
	    def remove_multiple_newlines(html_doc: str) -> str:
	        """Collapse runs of blank (whitespace-only) lines into one newline."""
	        return re.sub(r"(\n\s*\n)+", "\n", html_doc)

	    @staticmethod
	    def remove_no_script(html_doc: str) -> str:
	        """Remove ``<noscript>...</noscript>`` elements with their content."""
	        return re.sub(r"<noscript>.*?</noscript>", "", html_doc, flags=re.DOTALL)

	    @staticmethod
	    def remove_script(html_doc: str) -> str:
	        """Remove ``<script ...>...</script>`` elements with their content."""
	        return re.sub(r"<script.*?</script>", "", html_doc, flags=re.DOTALL)

	    @staticmethod
	    def remove_style(html_doc: str) -> str:
	        """Remove ``<style ...>...</style>`` elements with their content."""
	        return re.sub(r"<style.*?</style>", "", html_doc, flags=re.DOTALL)

	    @staticmethod
	    def remove_class_property(html_doc: str) -> str:
	        """Remove ``class`` attributes."""
	        return HtmlPreprocess._remove_attribute(html_doc, "class")

	    @staticmethod
	    def remove_id_property(html_doc: str) -> str:
	        """Remove ``id`` attributes."""
	        return HtmlPreprocess._remove_attribute(html_doc, "id")

	    @staticmethod
	    def remove_onclick_property(html_doc: str) -> str:
	        """Remove ``onclick`` attributes."""
	        return HtmlPreprocess._remove_attribute(html_doc, "onclick")

	    @staticmethod
	    def remove_style_property(html_doc: str) -> str:
	        """Remove ``style`` attributes (double- or single-quoted)."""
	        return HtmlPreprocess._remove_attribute(html_doc, "style")

	    @staticmethod
	    def replace_a(html_doc: str) -> str:
	        """Replace each ``<a ...>text</a>`` with just ``text``."""
	        return HtmlPreprocess._unwrap_tag(html_doc, "a")

	    @staticmethod
	    def replace_br(html_doc: str) -> str:
	        """Replace ``<br>``, ``<br/>`` and ``<br />`` with a newline."""
	        return re.sub(r"<br\s*/?>", "\n", html_doc)

	    @staticmethod
	    def replace_div(html_doc: str) -> str:
	        """Replace each ``<div ...>content</div>`` with just ``content``."""
	        return HtmlPreprocess._unwrap_tag(html_doc, "div")


	@BaseToMarkdown.register("html_markdownify")
	class HtmlToMarkdown(BaseToMarkdown, HtmlPreprocess):
	    """Convert a local HTML file to Markdown using ``markdownify``.

	    The class is registered with ``BaseToMarkdown`` under the name
	    ``"html_markdownify"``. The regex clean-up helpers come from
	    ``HtmlPreprocess``.
	    """

	    def __init__(self, filename: str):
	        """Read ``filename`` (UTF-8) and keep its prettified HTML.

	        Args:
	            filename: path of the HTML file to convert.
	        """
	        super().__init__(filename)
	        # NOTE(review): self.filename is presumably set by BaseToMarkdown.__init__.
	        with open(self.filename, "r", encoding="utf-8") as f:
	            raw_html = f.read()
	        # prettify() re-serialises the document with one tag per line, which
	        # is the layout the regex helpers below are applied to.
	        self.html_doc = BeautifulSoup(raw_html, "html.parser").prettify()

	    def get_md_text(self) -> str:
	        """Clean the stored HTML and return it converted to Markdown."""
	        options = {
	            "strip": ["script"],
	            "autolinks": False,
	        }

	        html_doc = self.html_doc
	        # Decode escaped angle brackets back into real ones. Replacing "<"
	        # with "<" (and ">" with ">") would be a no-op.
	        html_doc = html_doc.replace("&lt;", "<")
	        html_doc = html_doc.replace("&gt;", ">")

	        # Applied in order: whole elements first, then attributes, then
	        # wrapper tags that should be replaced by their content.
	        cleaning_steps = (
	            self.remove_comment,
	            self.remove_img,
	            self.remove_no_script,
	            self.remove_script,
	            self.remove_style,
	            self.remove_class_property,
	            self.remove_id_property,
	            self.remove_onclick_property,
	            self.remove_style_property,
	            self.replace_a,
	            self.replace_br,
	            self.replace_div,
	            self.remove_multiple_newlines,
	        )
	        for step in cleaning_steps:
	            html_doc = step(html_doc)

	        md_text = md(html_doc, **options)
	        md_text = self.remove_multiple_newlines(md_text)

	        return md_text

	    def save_to_zip(self, output_dir: str):
	        """Write the Markdown to a temp directory and zip it into ``output_dir``.

	        Args:
	            output_dir: directory in which ``<uuid>.zip`` is created.

	        Returns:
	            The path of the zip file, as built by ``os.path.join``.
	        """
	        basename = str(uuid.uuid4())

	        temp_dir = Path(tempfile.gettempdir()) / basename
	        temp_dir.mkdir(parents=True, exist_ok=False)
	        output_zip_file = os.path.join(output_dir, f"{basename}.zip")

	        # try/finally so the scratch directory is removed even when the
	        # conversion or the zipping step raises.
	        try:
	            md_file = temp_dir / f"{basename}.md"
	            md_text = self.get_md_text()
	            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
	                f.write(md_text)

	            # NOTE(review): zip_directory is not defined in this file;
	            # presumably inherited from BaseToMarkdown.
	            self.zip_directory(temp_dir, output_zip_file)
	        finally:
	            shutil.rmtree(temp_dir)
	        return output_zip_file


	class UrlToMarkdown(BaseToMarkdown, HtmlPreprocess):
	    """Fetch a web page with Chrome/Selenium and convert it to Markdown.

	    The regex clean-up helpers come from ``HtmlPreprocess``.
	    """

	    def __init__(self, url: str):
	        """Download ``url`` and keep its prettified HTML.

	        Args:
	            url: address of the page to convert. It is also passed to
	                ``BaseToMarkdown.__init__`` in place of a filename.
	        """
	        super().__init__(url)
	        self.url = url
	        raw_html = self.get_url_content(url)
	        # prettify() re-serialises the document with one tag per line, which
	        # is the layout the regex helpers below are applied to.
	        self.html_doc = BeautifulSoup(raw_html, "html.parser").prettify()

	    def get_url_content(self, url: str):
	        """Load ``url`` in Chrome and return the rendered page source.

	        The chromedriver binary is installed through ``webdriver_manager``
	        with its cache rooted at ``project_path / "data"``.
	        """
	        chrome_driver_manager = ChromeDriverManager(
	            cache_manager=DriverCacheManager(
	                root_dir=(project_path / "data").as_posix()
	            )
	        )
	        driver_path = chrome_driver_manager.install()
	        print(f"driver_path: {driver_path}")

	        # Selenium's Service takes the driver binary as ``executable_path``;
	        # ``driver_path`` is not one of its parameters, so the binary
	        # installed above would not be the one used.
	        driver = webdriver.Chrome(
	            service=Service(executable_path=driver_path),
	        )
	        # Always shut the browser down, even if loading the page fails.
	        try:
	            driver.get(url)
	            # Scroll to the bottom of the page before reading its source.
	            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
	            return driver.page_source
	        finally:
	            driver.quit()

	    def get_md_text(self) -> str:
	        """Clean the stored HTML and return it converted to Markdown."""
	        options = {
	            "strip": ["script"],
	            "autolinks": False,
	        }

	        html_doc = self.html_doc
	        # Decode escaped angle brackets back into real ones. Replacing "<"
	        # with "<" (and ">" with ">") would be a no-op.
	        html_doc = html_doc.replace("&lt;", "<")
	        html_doc = html_doc.replace("&gt;", ">")

	        # Applied in order: whole elements first, then attributes, then
	        # wrapper tags that should be replaced by their content.
	        cleaning_steps = (
	            self.remove_comment,
	            self.remove_img,
	            self.remove_no_script,
	            self.remove_script,
	            self.remove_style,
	            self.remove_class_property,
	            self.remove_id_property,
	            self.remove_onclick_property,
	            self.remove_style_property,
	            self.replace_a,
	            self.replace_br,
	            self.replace_div,
	            self.remove_multiple_newlines,
	        )
	        for step in cleaning_steps:
	            html_doc = step(html_doc)

	        md_text = md(html_doc, **options)
	        md_text = self.remove_multiple_newlines(md_text)

	        return md_text

	    def save_to_zip(self, output_dir: str):
	        """Write the Markdown to a temp directory and zip it into ``output_dir``.

	        Args:
	            output_dir: directory in which ``<uuid>.zip`` is created.

	        Returns:
	            The path of the zip file, as built by ``os.path.join``.
	        """
	        basename = str(uuid.uuid4())

	        temp_dir = Path(tempfile.gettempdir()) / basename
	        temp_dir.mkdir(parents=True, exist_ok=False)
	        output_zip_file = os.path.join(output_dir, f"{basename}.zip")

	        # try/finally so the scratch directory is removed even when the
	        # conversion or the zipping step raises.
	        try:
	            md_file = temp_dir / f"{basename}.md"
	            md_text = self.get_md_text()
	            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
	                f.write(md_text)

	            # NOTE(review): zip_directory is not defined in this file;
	            # presumably inherited from BaseToMarkdown.
	            self.zip_directory(temp_dir, output_zip_file)
	        finally:
	            shutil.rmtree(temp_dir)
	        return output_zip_file


	def main():
	    """Convert the HTML file given by ``--filename`` and print the zip path.

	    The archive is written to the current working directory.
	    """
	    cli_args = get_args()
	    converter = HtmlToMarkdown(cli_args.filename)
	    zip_path = converter.save_to_zip(output_dir=".")
	    print(zip_path)


	def main2(url: str = "https://www.baidu.com/"):
	    """Fetch ``url``, convert it to Markdown and print the zip path.

	    Args:
	        url: page to download; defaults to the URL this demo was
	            hard-coded to fetch.

	    The archive is written to the current working directory.
	    """
	    # The parsed options are not used here, but argv is still parsed so
	    # that ``--help`` and unknown-option errors keep working.
	    get_args()

	    converter = UrlToMarkdown(url)

	    output_zip_file = converter.save_to_zip(output_dir=".")
	    print(output_zip_file)


	# Script entry point: runs the URL demo (main2). The local-file demo
	# (main) is kept below, commented out, as the alternative.
	if __name__ == "__main__":
	# main()
	main2()