Spaces:
Paused
Paused
| import re | |
| from pathlib import Path | |
| from pprint import pprint | |
| from bs4 import BeautifulSoup, Comment, NavigableString, Tag | |
| from tiktoken import get_encoding as tiktoken_get_encoding | |
| from utils.logger import logger | |
| from markdownify import markdownify | |
| # from trafilatura import extract as extract_text_from_html | |
| # from inscriptis import get_text as extract_text_from_html | |
| # from html_text import extract_text as extract_text_from_html | |
| # from readabilipy import simple_json_from_html_string as extract_text_from_html | |
class WebpageContentExtractor:
    """Reduce a saved HTML page to its main content rendered as Markdown.

    The pipeline is: read the HTML file, drop elements that look like page
    chrome (see ``filter_html_str``), convert what is left to Markdown with
    ``markdownify``, collapse runs of blank lines, and report the token count
    of the result using the ``cl100k_base`` tiktoken encoding.
    """

    # Tags that are always removed, whatever their attributes are.
    IGNORE_TAGS = ("script", "style", "button")

    # An element is removed when its ``class`` or ``id`` contains any of these
    # substrings (case-insensitive), e.g. class="post-comments" matches "comment".
    IGNORE_CLASSES = (
        "sidebar",
        "footer",
        "related",
        "comment",
        "topbar",
        "menu",
        "offcanvas",
        "navbar",
    )

    # Compiled once instead of being rebuilt for every element of every page.
    _IGNORE_PATTERN = re.compile("|".join(IGNORE_CLASSES), flags=re.IGNORECASE)

    def __init__(self):
        self.tokenizer = tiktoken_get_encoding("cl100k_base")

    def count_tokens(self, text):
        """Return the number of tokens ``self.tokenizer`` produces for ``text``."""
        return len(self.tokenizer.encode(text))

    @classmethod
    def _has_ignored_marker(cls, class_str, id_str):
        """Return True if either string contains one of ``IGNORE_CLASSES``."""
        return bool(
            cls._IGNORE_PATTERN.search(class_str)
            or cls._IGNORE_PATTERN.search(id_str)
        )

    @classmethod
    def _should_remove(cls, element):
        """Decide whether a (still attached) bs4 tag should be dropped.

        A tag is dropped when it is one of ``IGNORE_TAGS``, when its text is
        empty after stripping whitespace, or when its ``class`` / ``id``
        attribute matches the ignore pattern.
        """
        if element.name in cls.IGNORE_TAGS:
            return True
        if not element.text.strip():
            return True

        class_attr = element.get("class") or []
        # html.parser yields a list for ``class``; tolerate a plain string too
        # so that joining does not split it into single characters.
        if isinstance(class_attr, str):
            class_str = class_attr
        else:
            class_str = " ".join(class_attr)
        id_str = element.get("id") or ""
        if not isinstance(id_str, str):
            id_str = " ".join(id_str)

        return cls._has_ignored_marker(class_str, id_str)

    def filter_html_str(self, html_str):
        """Strip non-content elements from ``html_str`` and return the new HTML.

        Args:
            html_str: Raw HTML markup.

        Returns:
            The serialized document after every element selected by
            ``_should_remove`` has been removed together with its subtree.

        Side effects:
            Logs the number of removed tags (descendants included) and the
            number of tags that remain.
        """
        soup = BeautifulSoup(html_str, "html.parser")

        # Snapshot of every tag in document order (ancestors before descendants).
        # Keeping the list alive also keeps each tag object alive, so the id()
        # values collected below stay unique for the duration of the loop.
        all_elements = soup.find_all()
        removed_ids = set()
        removed_element_counts = 0

        for element in all_elements:
            # decompose() destroys the whole subtree, yet the snapshot still
            # lists those descendants. Using a decomposed node is undefined in
            # bs4, so skip them instead of swallowing the resulting errors.
            if id(element) in removed_ids:
                continue
            if not self._should_remove(element):
                continue

            descendants = element.find_all()
            removed_ids.update(id(tag) for tag in descendants)
            # Count the element itself plus every tag that goes away with it.
            removed_element_counts += 1 + len(descendants)
            element.decompose()

        logger.note(
            f"Elements Removed/Remained: {removed_element_counts}/{len(soup.find_all())}"
        )
        return str(soup)

    def extract(self, html_path):
        """Read an HTML file and return its main content as Markdown.

        Args:
            html_path: Path of a UTF-8 encoded HTML file.

        Returns:
            The Markdown text, which is also stored in ``self.main_content``.

        Side effects:
            Logs the source path, the extracted Markdown and its token count.
        """
        logger.note(f"Extracing content from:{html_path}")

        with open(html_path, "r", encoding="utf-8") as f:
            html_str = f.read()

        html_str = self.filter_html_str(html_str)

        # NOTE: trafilatura, inscriptis, html_text and readabilipy were tried
        # here as alternative extractors (previously left as commented-out
        # code); markdownify on the filtered HTML is the one in use.
        # ``strip`` takes a list of tag names: links keep their text but lose
        # the Markdown link syntax.
        self.main_content = markdownify(html_str, strip=["a"])
        # Collapse three or more consecutive newlines into one blank line.
        self.main_content = re.sub(r"\n{3,}", "\n\n", self.main_content)

        logger.line(self.main_content)
        token_count = self.count_tokens(self.main_content)
        logger.note(f"Token Count: {token_count}")
        return self.main_content
if __name__ == "__main__":
    # Manual smoke run against one saved page. The samples sit in
    # "files/urls" under the parent of this file's directory.
    # Other sample files that can be swapped in:
    #   "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html"
    #   "docs.python.org_zh-cn_3_tutorial_interpreter.html"
    sample_file = "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html"
    urls_dir = Path(__file__).parents[1].joinpath("files", "urls")
    html_path = urls_dir / sample_file

    extractor = WebpageContentExtractor()
    main_content = extractor.extract(html_path)