#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Text Extractor Module with CJK Support
Provides superior text extraction from HTML with proper Unicode handling
Optimized for Korean, Japanese, and Chinese content extraction
"""
import os
import re
import html
import unicodedata
from typing import Tuple, Optional
import chardet
# IMPORT ORDER FIX - import BeautifulSoup BEFORE html2text
# This prevents certain parser initialization issues
try:
from bs4 import BeautifulSoup
# Force BeautifulSoup to initialize its parsers
_ = BeautifulSoup("", 'html.parser')
except ImportError:
BeautifulSoup = None
raise ImportError("BeautifulSoup is required. Install with: pip install beautifulsoup4")
# Now import html2text AFTER BeautifulSoup
try:
import html2text
except ImportError:
html2text = None
raise ImportError("html2text is required. Install with: pip install html2text")
class EnhancedTextExtractor:
"""Enhanced text extraction with proper Unicode and CJK handling"""
# Unicode preservation mappings
    UNICODE_QUOTES = {
        # Western quotes
        '&ldquo;': '\u201c',  # Left double quotation mark
        '&rdquo;': '\u201d',  # Right double quotation mark
        '&lsquo;': '\u2018',  # Left single quotation mark
        '&rsquo;': '\u2019',  # Right single quotation mark
        '&quot;': '"',        # Standard double quote
        '&apos;': "'",        # Standard apostrophe
        # CJK quotes and punctuation (numeric character references)
        '&#12300;': '「',  # Japanese left corner bracket
        '&#12301;': '」',  # Japanese right corner bracket
        '&#12302;': '『',  # Japanese left white corner bracket
        '&#12303;': '』',  # Japanese right white corner bracket
        '&#65288;': '（',  # Fullwidth left parenthesis
        '&#65289;': '）',  # Fullwidth right parenthesis
        '&#12304;': '【',  # Left black lenticular bracket
        '&#12305;': '】',  # Right black lenticular bracket
        '&#12298;': '《',  # Left double angle bracket
        '&#12299;': '》',  # Right double angle bracket
        '&#65307;': '；',  # Fullwidth semicolon
        '&#65306;': '：',  # Fullwidth colon
        '&#12290;': '。',  # Ideographic full stop
        '&#65311;': '？',  # Fullwidth question mark
        '&#65281;': '！',  # Fullwidth exclamation mark
        '&#12289;': '、',  # Ideographic comma
        # Numeric entities
        '&#8220;': '\u201c',  # Left double quote (numeric)
        '&#8221;': '\u201d',  # Right double quote (numeric)
        '&#8216;': '\u2018',  # Left single quote (numeric)
        '&#8217;': '\u2019',  # Right single quote (numeric)
        # Common CJK entities
        '&hellip;': '…',    # Horizontal ellipsis
        '&mdash;': '—',     # Em dash
        '&ndash;': '–',     # En dash
        '&nbsp;': '\u00A0', # Non-breaking space
    }
# CJK-specific punctuation to preserve
    CJK_PUNCTUATION = {
        '。', '、', '！', '？', '…', '—', '～', '・',
        '「', '」', '『', '』', '（', '）', '【', '】',
        '《', '》', '〈', '〉', '〔', '〕', '［', '］',
        '：', '；', '\u201c', '\u201d', '\u2018', '\u2019',
        '，', '．',
        '‚', '„', '«', '»',
    }
    # Quote protection markers (Control Pictures characters)
    QUOTE_MARKERS = {
        '\u201c': '\u2425',  # Opening double quote marker
        '\u201d': '\u2426',  # Closing double quote marker
        '"': '\u2426',       # Alternative (straight) closing quote
        '\u2018': '\u2423',  # Opening single quote marker
        '\u2019': '\u2424',  # Closing single quote marker (marker lost in source; U+2424 assumed)
        "'": '\u2424',       # Alternative (straight) closing quote (assumed)
    }
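    # These markers come from the Control Pictures block (U+2400-U+243F), which
    # essentially never appears in real prose, so the protect -> convert ->
    # restore round trip is very unlikely to collide with genuine content.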
def __init__(self, filtering_mode: str = "smart", preserve_structure: bool = True):
"""Initialize the enhanced text extractor"""
if not html2text:
raise ImportError("html2text is required for enhanced extraction")
if not BeautifulSoup:
raise ImportError("BeautifulSoup is required for enhanced extraction")
self.filtering_mode = filtering_mode
self.preserve_structure = preserve_structure
self.h2t = None
self.detected_language = None
self._configure_html2text()
def _detect_encoding(self, content: bytes) -> str:
"""Detect the encoding of the content"""
try:
# Try chardet detection
            detected = chardet.detect(content)
            if detected.get('encoding') and detected.get('confidence', 0) > 0.7:
                return detected['encoding']
except Exception:
pass
# Try common CJK encodings in order
for encoding in ['utf-8', 'gb2312', 'gbk', 'gb18030', 'big5', 'shift_jis', 'euc-kr', 'euc-jp']:
try:
content.decode(encoding)
return encoding
except Exception:
continue
return 'utf-8' # Default fallback
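    # Caveat: GBK/GB18030 will happily decode many EUC-KR and Shift-JIS byte
    # sequences, so the trial loop is only a rough fallback; the chardet result
    # is trusted first whenever its confidence clears 0.7.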
def _detect_content_language(self, text: str) -> str:
"""Detect the primary language of content"""
if not text:
return 'unknown'
# Take a sample of the text
sample = text[:5000]
# Count characters by script
korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
        latin_chars = sum(1 for char in sample if char.isascii() and char.isalpha())  # A-Z / a-z only
# Determine primary language
if korean_chars > 50:
return 'korean'
elif japanese_kana > 20:
return 'japanese'
elif chinese_chars > 50 and japanese_kana < 10:
return 'chinese'
elif latin_chars > 100:
return 'english'
else:
return 'unknown'
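    # Example: _detect_content_language('안녕하세요 ' * 20) returns 'korean'
    # (100 Hangul syllables clear the 50-character threshold).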
def _configure_html2text(self):
"""Configure html2text with optimal Unicode and CJK settings"""
self.h2t = html2text.HTML2Text()
# Core settings for Unicode preservation
self.h2t.unicode_snob = True
self.h2t.escape_snob = True
self.h2t.use_automatic_links = False
# Layout settings
self.h2t.body_width = 0
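        # body_width = 0 disables hard wrapping; wrapping would inject line
        # breaks into CJK sentences, which contain no spaces to re-join on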
self.h2t.single_line_break = False
# Content filtering
self.h2t.ignore_links = False
self.h2t.ignore_images = False
self.h2t.ignore_anchors = False
self.h2t.skip_internal_links = False
self.h2t.ignore_tables = False
# Image handling - CRITICAL: Force html2text to preserve img tags as HTML
self.h2t.images_as_html = True # Keep images as tags instead of ![]()
self.h2t.images_to_alt = False # Don't convert to alt text only
self.h2t.images_with_size = True # Include width/height attributes
# Additional settings
self.h2t.wrap_links = False
self.h2t.wrap_list_items = False
self.h2t.protect_links = True
# Structure preservation settings
if self.preserve_structure:
self.h2t.bypass_tables = False
self.h2t.ignore_emphasis = False
self.h2t.mark_code = True
self.h2t.ul_item_mark = '•'
else:
self.h2t.bypass_tables = True
self.h2t.ignore_emphasis = True
self.h2t.mark_code = False
def _decode_entities(self, text: str) -> str:
"""Decode HTML entities to Unicode characters with CJK support"""
if not text:
return text
# First pass: Apply known CJK-aware replacements
for entity, unicode_char in self.UNICODE_QUOTES.items():
text = text.replace(entity, unicode_char)
# Second pass: standard HTML unescape
text = html.unescape(text)
# Third pass: handle numeric entities
def decode_decimal(match):
try:
code = int(match.group(1))
if code < 0x110000:
return chr(code)
except Exception:
pass
return match.group(0)
def decode_hex(match):
try:
code = int(match.group(1), 16)
if code < 0x110000:
return chr(code)
except Exception:
pass
return match.group(0)
        text = re.sub(r'&#(\d+);?', decode_decimal, text)
        text = re.sub(r'&#[xX]([0-9a-fA-F]+);?', decode_hex, text)
# Fourth pass: handle special CJK entities
        cjk_special_entities = {
            '&lang;': '〈', '&rang;': '〉',
            '&lceil;': '⌈', '&rceil;': '⌉',
            '&lfloor;': '⌊', '&rfloor;': '⌋',
        }
for entity, char in cjk_special_entities.items():
text = text.replace(entity, char)
return text
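    # Example: _decode_entities('&#8220;Hi&#8221; &#x300C;你好&#x300D;')
    # returns '“Hi” 「你好」'.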
def _normalize_unicode(self, text: str) -> str:
"""Normalize Unicode with CJK awareness"""
if self.detected_language in ['korean', 'japanese', 'chinese']:
return text
else:
return unicodedata.normalize('NFC', text)
def _protect_quotes(self, text: str) -> str:
"""Protect quotes by replacing with special markers"""
for original, marker in self.QUOTE_MARKERS.items():
text = text.replace(original, marker)
return text
def _restore_quotes(self, text: str) -> str:
"""Restore quotes from special markers"""
for original, marker in self.QUOTE_MARKERS.items():
text = text.replace(marker, original)
return text
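    # Round-trip note: straight quotes share a marker with their curly closing
    # counterparts (see QUOTE_MARKERS), so they come back as smart quotes; the
    # mapping is lossy by construction in favor of consistent dialogue punctuation.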
def _preprocess_html_for_quotes(self, html_content: str) -> str:
"""Pre-process HTML to protect quotes from conversion"""
def protect_quotes_in_text(match):
text = match.group(1)
return f'>{self._protect_quotes(text)}<'
# Apply to all text between tags
html_content = re.sub(r'>([^<]+)<', protect_quotes_in_text, html_content)
return html_content
def _protect_quotes_in_soup(self, soup: BeautifulSoup) -> None:
"""Protect quotes in BeautifulSoup object before processing"""
for element in soup.find_all(string=True):
if element.parent.name not in ['script', 'style', 'noscript']:
original_text = str(element)
protected_text = self._protect_quotes(original_text)
element.replace_with(protected_text)
def _minimal_parser_fix(self, html_content: str) -> str:
"""Apply minimal fixes only for parser errors"""
# Fix tags with ="" pattern
html_content = re.sub(r'<[^>]*?=\s*""\s*[^>]*?>', '', html_content)
# Fix malformed closing tags
html_content = re.sub(r'\s+(\w+)>', r'\1>', html_content)
html_content = re.sub(r'\s*>', '', html_content)
html_content = re.sub(r'/+(\w+)>', r'\1>', html_content)
# Fix orphaned brackets
html_content = re.sub(r'<(?![a-zA-Z/!?])', '<', html_content)
html_content = re.sub(r'(?', '>', html_content)
# Fix unclosed tags at the end
if html_content.rstrip().endswith('<'):
html_content = html_content.rstrip()[:-1]
# Remove nested opening brackets
html_content = re.sub(r'<[^>]*?<[^>]*?>', '', html_content)
return html_content
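    # Example (approximate, per the heuristics above):
    #   _minimal_parser_fix('<p>1 < 2 and 3 > 2</p>')
    #   -> '<p>1 &lt; 2 and 3 &gt; 2</p>'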
def _clean_text_cjk_aware(self, text: str, preserve_structure: bool) -> str:
"""Clean extracted text with CJK awareness"""
if not preserve_structure and self.detected_language not in ['korean', 'japanese', 'chinese']:
# Only do aggressive cleanup for non-CJK text
text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
text = re.sub(r'\*(.*?)\*', r'\1', text)
text = re.sub(r'__(.*?)__', r'\1', text)
text = re.sub(r'_(.*?)_', r'\1', text)
text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
            text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
            text = re.sub(r'`([^`]+)`', r'\1', text)
text = re.sub(r'^[-*+]\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
# Clean whitespace
if self.detected_language in ['korean', 'japanese', 'chinese']:
text = re.sub(r'\n{3,}', '\n\n', text)
text = re.sub(r'[ ]{3,}', ' ', text)
else:
text = re.sub(r'\n{3,}', '\n\n', text)
text = re.sub(r' {2,}', ' ', text)
# Remove invisible characters
invisible_chars = ['\u200b', '\u200c', '\u200d', '\ufeff', '\u2060']
for char in invisible_chars:
text = text.replace(char, '')
return text.strip()
def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
"""Extract chapter title from various sources"""
# Try title tag first
if soup.title and soup.title.string:
title = soup.title.string.strip()
title = self._decode_entities(title)
return title
# Try headers in order
for header_tag in ['h1', 'h2', 'h3', 'h4']:
headers = soup.find_all(header_tag)
for header in headers:
title = header.get_text(strip=True)
if title:
title = self._decode_entities(title)
if self._is_chapter_title(title):
return title
return None
def _is_chapter_title(self, text: str) -> bool:
"""Check if text looks like a chapter title"""
if not text or len(text) > 200:
return False
# Common chapter patterns
patterns = [
r'第.{1,10}[章回話话]',
r'Chapter\s+\d+',
r'제\s*\d+\s*화',
r'第\d+話',
r'\d+\s*화',
r'EP\.?\s*\d+',
r'Part\s+\d+',
]
for pattern in patterns:
if re.search(pattern, text, re.IGNORECASE):
return True
# Check if it's short and doesn't contain too much punctuation
if len(text) < 100:
            punct_count = sum(1 for c in text if c in '.,;:!?。、！？')
if punct_count < len(text) * 0.2:
return True
return False
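    # Examples that match: '第12章', 'Chapter 3', '제 5 화', 'EP. 7'; long,
    # punctuation-heavy strings fall through and are rejected.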
def _extract_body_content(self, soup: BeautifulSoup, full_html: str) -> str:
"""Extract body content while preserving Unicode"""
# Remove script and style elements first
for element in soup(['script', 'style', 'noscript']):
element.decompose()
if soup.body:
return str(soup.body)
else:
return str(soup)
def extract_chapter_content(self, html_content: str, extraction_mode: str = None) -> Tuple[str, str, Optional[str]]:
"""Extract chapter content with proper Unicode and CJK handling"""
try:
# Use instance filtering_mode if not overridden
if extraction_mode is None:
extraction_mode = self.filtering_mode
# Handle encoding if content is bytes
if isinstance(html_content, bytes):
encoding = self._detect_encoding(html_content)
html_content = html_content.decode(encoding, errors='replace')
# Pre-process HTML to protect quotes
html_content = self._preprocess_html_for_quotes(html_content)
# Pre-process HTML to decode all entities
html_content = self._decode_entities(html_content)
# Detect language early
self.detected_language = self._detect_content_language(html_content)
print(f"🌐 Detected language: {self.detected_language}")
# Parse with BeautifulSoup
parser = 'html.parser'
if self.detected_language in ['korean', 'japanese', 'chinese']:
# For CJK content, lxml might handle encoding better if available
try:
import lxml
parser = 'lxml'
except ImportError:
pass
soup = BeautifulSoup(html_content, parser)
# Protect quotes before any processing
self._protect_quotes_in_soup(soup)
# Extract title
chapter_title = self._extract_title(soup)
            if chapter_title:
                # Quotes were protected before title extraction; restore them
                chapter_title = self._restore_quotes(chapter_title)
            # Respect GUI toggles to exclude headers/titles BEFORE conversion
            try:
                batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
                ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
                ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
                if ignore_title_tag and soup.title:
                    # Remove the <title> element so it is excluded from the output
                    soup.title.decompose()
                if ignore_header_tags:
                    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
                        header.decompose()
            except Exception:
                pass
            # Convert to text, then restore the protected quotes and clean up
            body_html = self._extract_body_content(soup, html_content)
            text = self.h2t.handle(body_html)
            text = self._restore_quotes(text)
            text = self._normalize_unicode(text)
            text = self._clean_text_cjk_aware(text, self.preserve_structure)
            # The middle tuple element carrying the raw body HTML is an assumption
            return text, body_html, chapter_title
        except Exception as e:
            raise RuntimeError(f"Enhanced extraction failed: {e}") from e
"왜 이러는 겁니까? 우리가 무슨 잘못을 했다고!"
"......"
"한 번만 살려주시오! 가족을 지키려면 어쩔 수 없었소!"
"응애! 응애! 응애!"
"미안하구나. 모든 죄는 내가 짊어지고 사마."
''' # Japanese test with quotes '''「こんにちは!これは日本語のテストです。」
彼は言った。「これで全部ですか?」
「はい、そうです」と答えた。
''', # Chinese test with quotes '''"你好!这是中文测试。"
他说:"这就是全部吗?"
"是的,"她回答道。
''', ] extractor = EnhancedTextExtractor() print("=== CJK and Quote Preservation Test ===\n") for i, test_html in enumerate(test_cases, 1): print(f"--- Test Case {i} ---") try: content, _, title = extractor.extract_chapter_content(test_html) print(f"Title: {title}") print(f"Content:\n{content}\n") # Check for quotes preservation quote_checks = [ ('"', 'Western double quotes'), ('「', 'Japanese left bracket'), ('」', 'Japanese right bracket'), ('“', 'Chinese double quote'), ] print("Quote preservation check:") quote_found = False for quote_char, desc in quote_checks: if quote_char in content: print(f" ✓ Found {desc}: {quote_char}") quote_found = True if not quote_found: print(" ❌ No quotes found!") else: print(" ✅ Quotes preserved successfully!") # Check for image tag preservation (html2text now preserves them natively) img_count = content.count('