|
|
|
|
|
|
|
|
import argparse |
|
|
import os |
|
|
from pathlib import Path |
|
|
import re |
|
|
import shutil |
|
|
import tempfile |
|
|
import uuid |
|
|
|
|
|
from markdownify import markdownify as md |
|
|
from selenium import webdriver |
|
|
from selenium.webdriver.chrome.service import Service |
|
|
from webdriver_manager.chrome import ChromeDriverManager, DriverCacheManager |
|
|
from bs4 import BeautifulSoup |
|
|
|
|
|
from project_settings import project_path |
|
|
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown |
|
|
|
|
|
|
|
|
def get_args():
    """Parse the command-line options of this script.

    The only option is ``--filename``, a string that defaults to the POSIX
    form of ``project_path / "data/files/html/nxcloud.html"``.

    :return: the ``argparse.Namespace`` produced by ``parse_args()``.
    """
    default_html = (project_path / "data/files/html/nxcloud.html").as_posix()

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", default=default_html, type=str)
    return cli.parse_args()
|
|
|
|
|
|
|
|
class HtmlPreprocess(object):
    """Regex helpers that strip or unwrap HTML markup.

    Every public helper is a static method that takes the HTML text as a
    string and returns the transformed string. The helpers work on raw text
    with regular expressions; they do not parse the document.
    """

    @staticmethod
    def _remove_attribute(html_doc: str, name: str) -> str:
        """Remove every `` name="..."`` / `` name='...'`` attribute.

        The value is matched with a negated character class so that an empty
        value (``name=""``) is removed on its own instead of letting the
        match run on into the following attribute.

        :param html_doc: HTML text.
        :param name: attribute name, matched literally after a single space.
        :return: the text with the attribute occurrences removed.
        """
        pattern = " {}=(?:\"[^\"]*\"|'[^']*')".format(re.escape(name))
        return re.sub(pattern, "", html_doc)

    @staticmethod
    def remove_comment(html_doc: str) -> str:
        """Remove ``<!-- ... -->`` comments, including multi-line ones."""
        return re.sub(r"<!--.*?-->", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_img(html_doc: str) -> str:
        """Remove ``<img ...>`` tags."""
        return re.sub(r"<img.*?>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_multiple_newlines(html_doc: str) -> str:
        """Collapse runs of blank (whitespace-only) lines into one newline."""
        return re.sub(r"(\n\s*\n)+", "\n", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_no_script(html_doc: str) -> str:
        """Remove ``<noscript>...</noscript>`` elements with their content."""
        return re.sub(r"<noscript>.*?</noscript>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_script(html_doc: str) -> str:
        """Remove ``<script ...>...</script>`` elements with their content."""
        return re.sub(r"<script.*?</script>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_style(html_doc: str) -> str:
        """Remove ``<style ...>...</style>`` elements with their content."""
        return re.sub(r"<style.*?</style>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_class_property(html_doc: str) -> str:
        """Remove ``class`` attributes (double- or single-quoted)."""
        return HtmlPreprocess._remove_attribute(html_doc, "class")

    @staticmethod
    def remove_id_property(html_doc: str) -> str:
        """Remove ``id`` attributes (double- or single-quoted)."""
        return HtmlPreprocess._remove_attribute(html_doc, "id")

    @staticmethod
    def remove_onclick_property(html_doc: str) -> str:
        """Remove ``onclick`` attributes (double- or single-quoted)."""
        return HtmlPreprocess._remove_attribute(html_doc, "onclick")

    @staticmethod
    def remove_style_property(html_doc: str) -> str:
        """Remove ``style`` attributes (double- or single-quoted)."""
        return HtmlPreprocess._remove_attribute(html_doc, "style")

    @staticmethod
    def replace_a(html_doc: str) -> str:
        """Replace each ``<a ...>text</a>`` element with its inner text."""
        return re.sub(r"<a\b[^>]*>(.*?)</a>", r"\1", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_br(html_doc: str) -> str:
        """Replace ``<br>``, ``<br/>``, ``<br />`` (any inner spacing) with a newline."""
        return re.sub(r"<br\s*/?>", "\n", html_doc)

    @staticmethod
    def replace_div(html_doc: str) -> str:
        """Replace ``<div ...>content</div>`` elements with their content.

        A single regex pass pairs each opening tag with the nearest closing
        tag, which leaves stray tags behind when divs are nested. The
        substitution is therefore repeated until no pair is left. Each pass
        that substitutes removes characters, so the loop terminates.
        """
        pattern = re.compile(r"<div\b[^>]*>(.*?)</div>", flags=re.DOTALL)
        replaced = 1
        while replaced:
            html_doc, replaced = pattern.subn(r"\1", html_doc)
        return html_doc
|
|
|
|
|
|
|
|
@BaseToMarkdown.register("html_markdownify")
class HtmlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Convert a local HTML file to Markdown with ``markdownify``.

    Registered on ``BaseToMarkdown`` under the name ``"html_markdownify"``.
    """

    def __init__(self, filename: str):
        """Read the HTML file and keep a prettified copy in ``self.html_doc``.

        :param filename: path of the HTML file, passed to the base class.
        """
        super().__init__(filename)
        # NOTE(review): self.filename is presumably set by BaseToMarkdown.__init__ - confirm.
        with open(self.filename, "r", encoding="utf-8") as f:
            html_doc = f.read()

        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_md_text(self) -> str:
        """Clean ``self.html_doc`` with the regex helpers and convert it to Markdown.

        :return: the Markdown text with blank-line runs collapsed.
        """
        options = {
            "strip": ["script"],
            "autolinks": False,
        }

        html_doc = self.html_doc
        # Turn escaped angle brackets back into literal ones. The previous
        # calls replaced "<" and ">" with themselves, which did nothing.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # Order matters: whole elements are dropped first, then attributes,
        # then wrappers are unwrapped, and blank lines are collapsed last.
        steps = (
            self.remove_comment,
            self.remove_img,
            self.remove_no_script,
            self.remove_script,
            self.remove_style,
            self.remove_class_property,
            self.remove_id_property,
            self.remove_onclick_property,
            self.remove_style_property,
            self.replace_a,
            self.replace_br,
            self.replace_div,
            self.remove_multiple_newlines,
        )
        for step in steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        md_text = self.remove_multiple_newlines(md_text)
        return md_text

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temporary directory and zip that directory.

        The directory, the ``.md`` file and the ``.zip`` file all share one
        freshly generated UUID as their base name.

        :param output_dir: directory in which the zip file is created.
        :return: path of the zip file, ``<output_dir>/<uuid>.zip``.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Remove the temporary directory even when conversion or zipping
        # raises, so failures do not leave files behind in the temp folder.
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # zip_directory is not defined in this class; presumably inherited
            # from BaseToMarkdown - confirm.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
|
|
|
|
|
|
|
|
class UrlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Load a URL in Chrome through Selenium and convert the page to Markdown."""

    def __init__(self, url: str):
        """Fetch the page and keep a prettified copy in ``self.html_doc``.

        :param url: address of the page; also passed to the base class.
        """
        super().__init__(url)
        self.url = url

        html_doc = self.get_url_content(url)
        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_url_content(self, url: str):
        """Return the page source of ``url`` as seen by a Chrome session.

        The chromedriver binary is installed through ``ChromeDriverManager``
        with its cache rooted at ``project_path / "data"``.

        :param url: address to load.
        :return: ``driver.page_source`` after scrolling to the page bottom.
        """
        chrome_driver_manager = ChromeDriverManager(
            cache_manager=DriverCacheManager(
                root_dir=(project_path / "data").as_posix()
            )
        )
        driver_path = chrome_driver_manager.install()
        print(f"driver_path: {driver_path}")

        # Selenium's Service takes the binary location as ``executable_path``;
        # ``driver_path`` is not a parameter it recognises, so the installed
        # driver was never actually handed over.
        driver = webdriver.Chrome(
            service=Service(executable_path=driver_path),
        )

        # Always shut the browser down, even if navigation or the script fails.
        try:
            driver.get(url)
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            html_doc = driver.page_source
        finally:
            driver.quit()
        return html_doc

    def get_md_text(self) -> str:
        """Clean ``self.html_doc`` with the regex helpers and convert it to Markdown.

        :return: the Markdown text with blank-line runs collapsed.
        """
        options = {
            "strip": ["script"],
            "autolinks": False,
        }

        html_doc = self.html_doc
        # Turn escaped angle brackets back into literal ones. The previous
        # calls replaced "<" and ">" with themselves, which did nothing.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # Order matters: whole elements are dropped first, then attributes,
        # then wrappers are unwrapped, and blank lines are collapsed last.
        steps = (
            self.remove_comment,
            self.remove_img,
            self.remove_no_script,
            self.remove_script,
            self.remove_style,
            self.remove_class_property,
            self.remove_id_property,
            self.remove_onclick_property,
            self.remove_style_property,
            self.replace_a,
            self.replace_br,
            self.replace_div,
            self.remove_multiple_newlines,
        )
        for step in steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        md_text = self.remove_multiple_newlines(md_text)
        return md_text

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temporary directory and zip that directory.

        The directory, the ``.md`` file and the ``.zip`` file all share one
        freshly generated UUID as their base name.

        :param output_dir: directory in which the zip file is created.
        :return: path of the zip file, ``<output_dir>/<uuid>.zip``.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Remove the temporary directory even when conversion or zipping
        # raises, so failures do not leave files behind in the temp folder.
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # zip_directory is not defined in this class; presumably inherited
            # from BaseToMarkdown - confirm.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
|
|
|
|
|
|
|
|
def main():
    """Convert the HTML file named by ``--filename`` and print the zip path.

    The zip archive is written to the current working directory.
    """
    cli_args = get_args()
    converter = HtmlToMarkdown(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
|
|
|
|
|
|
|
|
def main2():
    """Convert the page at ``https://www.baidu.com/`` and print the zip path.

    The zip archive is written to the current working directory.
    """
    # The command line is still parsed, but the parsed values are not used here.
    _ = get_args()

    converter = UrlToMarkdown("https://www.baidu.com/")
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
|
|
|
|
|
|
|
|
# Script entry point: when executed directly, run the URL-based demo.
if __name__ == "__main__":
    main2()
|
|
|