#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Text Extractor Module with CJK Support
Provides superior text extraction from HTML with proper Unicode handling
Optimized for Korean, Japanese, and Chinese content extraction
"""

import os
import re
import html
import unicodedata
from typing import Tuple, Optional

import chardet

# BEAUTIFUL SOUP IMPORT MONKEY FIX - Import BeautifulSoup BEFORE html2text
# This prevents certain parser initialization issues
try:
    from bs4 import BeautifulSoup
    # Force BeautifulSoup to initialize its parsers
    _ = BeautifulSoup("", 'html.parser')
except ImportError:
    BeautifulSoup = None
    raise ImportError("BeautifulSoup is required. Install with: pip install beautifulsoup4")

# Now import html2text AFTER BeautifulSoup
try:
    import html2text
except ImportError:
    html2text = None
    raise ImportError("html2text is required. Install with: pip install html2text")


class EnhancedTextExtractor:
    """Enhanced text extraction with proper Unicode and CJK handling"""

    # HTML-entity to Unicode preservation mappings
    UNICODE_QUOTES = {
        # Western quotes
        '&ldquo;': '\u201c',   # Left double quotation mark
        '&rdquo;': '\u201d',   # Right double quotation mark
        '&lsquo;': '\u2018',   # Left single quotation mark
        '&rsquo;': '\u2019',   # Right single quotation mark
        '&quot;': '"',         # Standard double quote
        '&apos;': "'",         # Standard apostrophe
        # CJK quotes and punctuation (decimal numeric entities)
        '&#12300;': '「',      # Japanese left corner bracket
        '&#12301;': '」',      # Japanese right corner bracket
        '&#12302;': '『',      # Japanese left white corner bracket
        '&#12303;': '』',      # Japanese right white corner bracket
        '&#65288;': '(',      # Fullwidth left parenthesis
        '&#65289;': ')',      # Fullwidth right parenthesis
        '&#12304;': '【',      # Left black lenticular bracket
        '&#12305;': '】',      # Right black lenticular bracket
        '&#12298;': '《',      # Left double angle bracket
        '&#12299;': '》',      # Right double angle bracket
        '&#65307;': ';',      # Fullwidth semicolon
        '&#65306;': ':',      # Fullwidth colon
        '&#12290;': '。',      # Ideographic full stop
        '&#65311;': '?',      # Fullwidth question mark
        '&#65281;': '!',      # Fullwidth exclamation mark
        '&#12289;': '、',      # Ideographic comma
        # Numeric entities
        '&#8220;': '\u201c',   # Left double quote (numeric)
        '&#8221;': '\u201d',   # Right double quote (numeric)
        '&#8216;': '\u2018',   # Left single quote (numeric)
        '&#8217;': '\u2019',   # Right single quote (numeric)
        # Common CJK entities
        '&hellip;': '…',       # Horizontal ellipsis
        '&mdash;': '—',        # Em dash
        '&ndash;': '–',        # En dash
        '&nbsp;': '\u00A0',    # Non-breaking space
    }

    # CJK-specific punctuation to preserve
    CJK_PUNCTUATION = {
        '。', '、', '!', '?', '…', '—', '~', '・',
        '「', '」', '『', '』', '(', ')', '【', '】',
        '《', '》', '〈', '〉', '〔', '〕', '[', ']',
        ':', ';', '\u201c', '\u201d', '\u2018', '\u2019',
        ',', '.', '?', '!', ':', ';', '＂', '＇',
        '‚', '„', '«', '»',
    }

    # Quote protection markers
    QUOTE_MARKERS = {
        '\u201c': '␥',  # Opening double quote marker
        '\u201d': '␦',  # Closing double quote marker
        '＂': '␦',      # Alternative closing quote (fullwidth)
        '\u2018': '␣',  # Opening single quote marker
        '\u2019': '␤',  # Closing single quote marker
        '＇': '␤',      # Alternative closing quote (fullwidth)
    }

    def __init__(self, filtering_mode: str = "smart", preserve_structure: bool = True):
        """Initialize the enhanced text extractor"""
        if not html2text:
            raise ImportError("html2text is required for enhanced extraction")
        if not BeautifulSoup:
            raise ImportError("BeautifulSoup is required for enhanced extraction")

        self.filtering_mode = filtering_mode
        self.preserve_structure = preserve_structure
        self.h2t = None
        self.detected_language = None
        self._configure_html2text()

    def _detect_encoding(self, content: bytes) -> str:
        """Detect the encoding of the content"""
        try:
            # Try chardet detection
            detected = chardet.detect(content)
            if detected and detected.get('encoding') and detected.get('confidence', 0) > 0.7:
                return detected['encoding']
        except Exception:
            pass

        # Try common CJK encodings in order
        for encoding in ['utf-8', 'gb2312', 'gbk', 'gb18030', 'big5', 'shift_jis', 'euc-kr', 'euc-jp']:
            try:
                content.decode(encoding)
                return encoding
            except Exception:
                continue

        return 'utf-8'  # Default fallback

    def _detect_content_language(self, text: str) -> str:
        """Detect the primary language of the content"""
        if not text:
            return 'unknown'

        # Take a sample of the text
        sample = text[:5000]

        # Count characters by script
        korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
        japanese_kana = sum(1 for char in sample
                            if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
        chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
        latin_chars = sum(1 for char in sample if 0x0041 <= ord(char) <= 0x007A)

        # Determine primary language
        if korean_chars > 50:
            return 'korean'
        elif japanese_kana > 20:
            return 'japanese'
        elif chinese_chars > 50 and japanese_kana < 10:
            return 'chinese'
        elif latin_chars > 100:
            return 'english'
        else:
            return 'unknown'

    def _configure_html2text(self):
        """Configure html2text with optimal Unicode and CJK settings"""
        self.h2t = html2text.HTML2Text()

        # Core settings for Unicode preservation
        self.h2t.unicode_snob = True
        self.h2t.escape_snob = True
        self.h2t.use_automatic_links = False

        # Layout settings
        self.h2t.body_width = 0
        self.h2t.single_line_break = False

        # Content filtering
        self.h2t.ignore_links = False
        self.h2t.ignore_images = False
        self.h2t.ignore_anchors = False
        self.h2t.skip_internal_links = False
        self.h2t.ignore_tables = False

        # Image handling - CRITICAL: force html2text to preserve img tags as HTML
        self.h2t.images_as_html = True    # Keep images as <img> tags instead of ![]()
        self.h2t.images_to_alt = False    # Don't convert to alt text only
        self.h2t.images_with_size = True  # Include width/height attributes

        # Additional settings
        self.h2t.wrap_links = False
        self.h2t.wrap_list_items = False
        self.h2t.protect_links = True

        # Structure preservation settings
        if self.preserve_structure:
            self.h2t.bypass_tables = False
            self.h2t.ignore_emphasis = False
            self.h2t.mark_code = True
            self.h2t.ul_item_mark = '•'
        else:
            self.h2t.bypass_tables = True
            self.h2t.ignore_emphasis = True
            self.h2t.mark_code = False

    def _decode_entities(self, text: str) -> str:
        """Decode HTML entities to Unicode characters with CJK support"""
        if not text:
            return text

        # First pass: apply known CJK-aware entity replacements
        for entity, unicode_char in self.UNICODE_QUOTES.items():
            text = text.replace(entity, unicode_char)

        # Second pass: standard HTML unescape
        text = html.unescape(text)

        # Third pass: handle numeric entities
        def decode_decimal(match):
            try:
                code = int(match.group(1))
                if code < 0x110000:
                    return chr(code)
            except Exception:
                pass
            return match.group(0)

        def decode_hex(match):
            try:
                code = int(match.group(1), 16)
                if code < 0x110000:
                    return chr(code)
            except Exception:
                pass
            return match.group(0)

        text = re.sub(r'&#(\d+);?', decode_decimal, text)
        text = re.sub(r'&#x([0-9a-fA-F]+);?', decode_hex, text)

        # Fourth pass: handle special CJK entities
        cjk_special_entities = {
            '&lang;': '〈',
            '&rang;': '〉',
            '&lceil;': '⌈',
            '&rceil;': '⌉',
            '&lfloor;': '⌊',
            '&rfloor;': '⌋',
        }
        for entity, char in cjk_special_entities.items():
            text = text.replace(entity, char)

        return text

    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode with CJK awareness"""
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            # Skip normalization so width and compatibility forms are preserved
            return text
        else:
            return unicodedata.normalize('NFC', text)

    def _protect_quotes(self, text: str) -> str:
        """Protect quotes by replacing them with special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(original, marker)
        return text

    def _restore_quotes(self, text: str) -> str:
        """Restore quotes from special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(marker, original)
        return text

    def _preprocess_html_for_quotes(self, html_content: str) -> str:
        """Pre-process HTML to protect quotes from conversion"""
        def protect_quotes_in_text(match):
            text = match.group(1)
            return f'>{self._protect_quotes(text)}<'

        # Apply to all text between tags
        html_content = re.sub(r'>([^<]+)<', protect_quotes_in_text, html_content)
        return html_content

    def _protect_quotes_in_soup(self, soup: BeautifulSoup) -> None:
        """Protect quotes in a BeautifulSoup object before processing"""
        for element in soup.find_all(string=True):
            if element.parent.name not in ['script', 'style', 'noscript']:
                original_text = str(element)
                protected_text = self._protect_quotes(original_text)
                element.replace_with(protected_text)

    def _minimal_parser_fix(self, html_content: str) -> str:
        """Apply minimal fixes only for parser errors"""
        # Fix tags with ="" pattern
        html_content = re.sub(r'<[^>]*?=\s*""\s*[^>]*?>', '', html_content)

        # Fix malformed closing tags
        html_content = re.sub(r'</\s+([a-zA-Z][a-zA-Z0-9]*)\s*>', r'</\1>', html_content)
        html_content = re.sub(r'</\s*>', '', html_content)
        html_content = re.sub(r'</([a-zA-Z][a-zA-Z0-9]*)\s+[^>]*>', r'</\1>', html_content)

        # Fix orphaned brackets
        html_content = re.sub(r'<(?![a-zA-Z/!?])', '&lt;', html_content)
        html_content = re.sub(r'(?<![a-zA-Z0-9"\'/\s-])>', '&gt;', html_content)

        # Fix unclosed tags at the end
        if html_content.rstrip().endswith('<'):
            html_content = html_content.rstrip()[:-1]

        # Remove nested opening brackets
        html_content = re.sub(r'<[^>]*?<[^>]*?>', '', html_content)

        return html_content

    def _clean_text_cjk_aware(self, text: str, preserve_structure: bool) -> str:
        """Clean extracted text with CJK awareness"""
        if not preserve_structure and self.detected_language not in ['korean', 'japanese', 'chinese']:
            # Only do aggressive markdown cleanup for non-CJK text
            text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
            text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
            text = re.sub(r'\*(.*?)\*', r'\1', text)
            text = re.sub(r'__(.*?)__', r'\1', text)
            text = re.sub(r'_(.*?)_', r'\1', text)
            text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
            text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
            text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
            text = re.sub(r'`([^`]+)`', r'\1', text)
            text = re.sub(r'^[-*+]\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)

        # Clean whitespace
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = re.sub(r'[ ]{3,}', ' ', text)
        else:
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = re.sub(r' {2,}', ' ', text)

        # Remove invisible characters
        invisible_chars = ['\u200b', '\u200c', '\u200d', '\ufeff', '\u2060']
        for char in invisible_chars:
            text = text.replace(char, '')

        return text.strip()

    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract chapter title from various sources"""
        # Try the title tag first
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
            title = self._decode_entities(title)
            return title

        # Try headers in order
        for header_tag in ['h1', 'h2', 'h3', 'h4']:
            headers = soup.find_all(header_tag)
            for header in headers:
                title = header.get_text(strip=True)
                if title:
                    title = self._decode_entities(title)
                    if self._is_chapter_title(title):
                        return title

        return None
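
    # Examples of strings the chapter-title heuristics below are intended to match
    # (illustrative samples only, derived from the regex patterns in _is_chapter_title):
    #   "第3章"       - Chinese/Japanese-style "Chapter 3"
    #   "Chapter 12"  - English chapter heading
    #   "제5화"        - Korean "Episode 5"
    #   "EP. 7"       - abbreviated episode marker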
    def _is_chapter_title(self, text: str) -> bool:
        """Check whether text looks like a chapter title"""
        if not text or len(text) > 200:
            return False

        # Common chapter patterns
        patterns = [
            r'第.{1,10}[章回話话]',
            r'Chapter\s+\d+',
            r'제\s*\d+\s*화',
            r'第\d+話',
            r'\d+\s*화',
            r'EP\.?\s*\d+',
            r'Part\s+\d+',
        ]

        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True

        # Accept short strings that don't contain too much punctuation
        if len(text) < 100:
            punct_count = sum(1 for c in text if c in '.,;:!?。、!?')
            if punct_count < len(text) * 0.2:
                return True

        return False

    def _extract_body_content(self, soup: BeautifulSoup, full_html: str) -> str:
        """Extract body content while preserving Unicode"""
        # Remove script and style elements first
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()

        if soup.body:
            return str(soup.body)
        else:
            return str(soup)

    def extract_chapter_content(self, html_content: str,
                                extraction_mode: Optional[str] = None) -> Tuple[str, str, Optional[str]]:
        """Extract chapter content with proper Unicode and CJK handling"""
        try:
            # Use the instance filtering_mode unless overridden
            if extraction_mode is None:
                extraction_mode = self.filtering_mode

            # Handle encoding if the content arrives as bytes
            if isinstance(html_content, bytes):
                encoding = self._detect_encoding(html_content)
                html_content = html_content.decode(encoding, errors='replace')

            # Pre-process HTML to protect quotes
            html_content = self._preprocess_html_for_quotes(html_content)

            # Pre-process HTML to decode all entities
            html_content = self._decode_entities(html_content)

            # Detect language early
            self.detected_language = self._detect_content_language(html_content)
            print(f"🌐 Detected language: {self.detected_language}")

            # Parse with BeautifulSoup
            parser = 'html.parser'
            if self.detected_language in ['korean', 'japanese', 'chinese']:
                # For CJK content, lxml may handle encoding better if available
                try:
                    import lxml
                    parser = 'lxml'
                except ImportError:
                    pass

            soup = BeautifulSoup(html_content, parser)

            # Protect quotes before any further processing
            self._protect_quotes_in_soup(soup)

            # Extract title
            chapter_title = self._extract_title(soup)

            # Respect GUI toggles to exclude headers/titles BEFORE conversion
            try:
                batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
                ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
                ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active

                if ignore_title_tag and soup.title:
                    # Remove so it isn't included when using full extraction
                    soup.title.decompose()

                if ignore_header_tags:
                    # Remove visible headers from the body prior to conversion
                    for tag_name in ['h1', 'h2', 'h3']:
                        for hdr in soup.find_all(tag_name):
                            hdr.decompose()
            except Exception:
                # Non-fatal: proceed with the original soup if anything goes wrong
                pass

            # Determine the content to convert (after removals)
            if extraction_mode == "full":
                content_to_convert = str(soup)
            else:
                content_to_convert = self._extract_body_content(soup, html_content)

            # Decode entities once more before handing off to html2text
            content_to_convert = self._decode_entities(content_to_convert)

            # Convert to text with error handling
            try:
                clean_text = self.h2t.handle(content_to_convert)
            except (AssertionError, UnboundLocalError) as e:
                error_msg = str(e)
                if ("cannot access local variable" in error_msg
                        or "we should not get here!" in error_msg
                        or "unexpected call to parse_endtag" in error_msg
                        or "unexpected call to parse_starttag" in error_msg):
                    print(f"⚠️ html2text encountered malformed HTML: {error_msg}")
                    print("⚠️ Applying minimal fixes...")

                    # Apply minimal fixes
                    content_to_convert = self._minimal_parser_fix(content_to_convert)
                    try:
                        clean_text = self.h2t.handle(content_to_convert)
                        print("✅ Successfully processed after minimal fixes")
                    except Exception as e2:
                        print(f"⚠️ html2text still failing: {e2}")
                        # Last-resort fallback
                        clean_text = soup.get_text(separator='\n', strip=True)
                        print("✅ Used BeautifulSoup fallback")
                else:
                    # Re-raise if it's a different error
                    raise
            except Exception as e:
                print(f"⚠️ Unexpected error in html2text: {e}")
                # Fallback to BeautifulSoup
                clean_text = soup.get_text(separator='\n', strip=True)

            # Normalize only if appropriate
            clean_text = self._normalize_unicode(clean_text)

            # Clean based on settings and language
            clean_text = self._clean_text_cjk_aware(clean_text, self.preserve_structure)

            # Restore protected quotes
            clean_text = self._restore_quotes(clean_text)

            # For enhanced mode, display and translation content are the same
            return clean_text, clean_text, chapter_title

        except Exception as e:
            print(f"❌ Enhanced extraction failed: {e}")
            raise


# Test function
def test_cjk_preservation():
    """Test that CJK characters and quotes are properly preserved"""
    test_cases = [
        # Korean test with quotes
        '''<html>
<head><title>제국의 붉은 사신</title></head>
<body>

"왜 이러는 겁니까? 우리가 무슨 잘못을 했다고!"

"......"

"한 번만 살려주시오! 가족을 지키려면 어쩔 수 없었소!"

"응애! 응애! 응애!"

"미안하구나. 모든 죄는 내가 짊어지고 사마."

</body>
</html>''',
        # Japanese test with quotes
        '''<html>
<head><title>第1話:始まり</title></head>
<body>

<h1>第1話:始まり</h1>

<p>「こんにちは!これは日本語のテストです。」</p>

<p>彼は言った。「これで全部ですか?」</p>

<p>「はい、そうです」と答えた。</p>

</body>
</html>''',
        # Chinese test with quotes
        '''<html>
<head><title>第一章:开始</title></head>
<body>

<h1>第一章:开始</h1>

"你好!这是中文测试。"

他说:"这就是全部吗?"

"是的,"她回答道。

</body>
</html>''',
    ]

    extractor = EnhancedTextExtractor()

    print("=== CJK and Quote Preservation Test ===\n")

    for i, test_html in enumerate(test_cases, 1):
        print(f"--- Test Case {i} ---")
        try:
            content, _, title = extractor.extract_chapter_content(test_html)
            print(f"Title: {title}")
            print(f"Content:\n{content}\n")

            # Check for quote preservation
            quote_checks = [
                ('"', 'Western double quotes'),
                ('「', 'Japanese left bracket'),
                ('」', 'Japanese right bracket'),
                ('“', 'Chinese double quote'),
            ]

            print("Quote preservation check:")
            quote_found = False
            for quote_char, desc in quote_checks:
                if quote_char in content:
                    print(f"  ✓ Found {desc}: {quote_char}")
                    quote_found = True

            if not quote_found:
                print("  ❌ No quotes found!")
            else:
                print("  ✅ Quotes preserved successfully!")

            # Check for image tag preservation (html2text preserves them natively)
            img_count = content.count('<img')
            if img_count > 0:
                print(f"  ✓ Found {img_count} HTML img tags (preserved natively by html2text)")
                print("  ✅ Image tags preserved successfully!")
            else:
                print("  ℹ️ No images in this test case")

        except Exception as e:
            print(f"Error processing test case {i}: {e}")

        print("-" * 50 + "\n")


if __name__ == "__main__":
    test_cjk_preservation()
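
# --- Usage sketch (illustrative only, not executed on import) ---
# A minimal example of how this extractor might be driven from other code.
# The file name below is hypothetical; any HTML chapter (str or bytes) works,
# since byte input goes through automatic encoding detection.
#
#     extractor = EnhancedTextExtractor(filtering_mode="smart", preserve_structure=True)
#     with open("chapter_001.html", "rb") as fh:   # hypothetical input file
#         raw = fh.read()
#     text, _, title = extractor.extract_chapter_content(raw)
#     print(title)
#     print(text[:200])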