#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Enhanced Text Extractor Module with CJK Support
Provides superior text extraction from HTML with proper Unicode handling
Optimized for Korean, Japanese, and Chinese content extraction
"""
import os
import re
import html
import unicodedata
from typing import Tuple, Optional

import chardet

# BEAUTIFUL SOUP IMPORT MONKEY FIX - Import BeautifulSoup BEFORE html2text
# This prevents certain parser initialization issues
try:
    from bs4 import BeautifulSoup
    # Force BeautifulSoup to initialize its parsers
    _ = BeautifulSoup("", 'html.parser')
except ImportError:
    BeautifulSoup = None
    raise ImportError("BeautifulSoup is required. Install with: pip install beautifulsoup4")

# Now import html2text AFTER BeautifulSoup
try:
    import html2text
except ImportError:
    html2text = None
    raise ImportError("html2text is required. Install with: pip install html2text")


class EnhancedTextExtractor:
    """Enhanced text extraction with proper Unicode and CJK handling"""

    # Unicode preservation mappings
    UNICODE_QUOTES = {
        # Western quotes
        '&ldquo;': '\u201c',  # Left double quotation mark
        '&rdquo;': '\u201d',  # Right double quotation mark
        '&lsquo;': '\u2018',  # Left single quotation mark
        '&rsquo;': '\u2019',  # Right single quotation mark
        '&quot;': '"',        # Standard double quote
        '&apos;': "'",        # Standard apostrophe
        # CJK quotes and punctuation
        '&#12300;': '「',  # Japanese left corner bracket
        '&#12301;': '」',  # Japanese right corner bracket
        '&#12302;': '『',  # Japanese left white corner bracket
        '&#12303;': '』',  # Japanese right white corner bracket
        '&#65288;': '（',  # Fullwidth left parenthesis
        '&#65289;': '）',  # Fullwidth right parenthesis
        '&#12304;': '【',  # Left black lenticular bracket
        '&#12305;': '】',  # Right black lenticular bracket
        '&#12298;': '《',  # Left double angle bracket
        '&#12299;': '》',  # Right double angle bracket
        '&#65307;': '；',  # Fullwidth semicolon
        '&#65306;': '：',  # Fullwidth colon
        '&#12290;': '。',  # Ideographic full stop
        '&#65311;': '？',  # Fullwidth question mark
        '&#65281;': '！',  # Fullwidth exclamation mark
        '&#12289;': '、',  # Ideographic comma
        # Numeric entities
        '&#8220;': '\u201c',  # Left double quote (numeric)
        '&#8221;': '\u201d',  # Right double quote (numeric)
        '&#8216;': '\u2018',  # Left single quote (numeric)
        '&#8217;': '\u2019',  # Right single quote (numeric)
        # Common CJK entities
        '&hellip;': '…',     # Horizontal ellipsis
        '&mdash;': '—',      # Em dash
        '&ndash;': '–',      # En dash
        '&nbsp;': '\u00A0',  # Non-breaking space
    }
    # CJK-specific punctuation to preserve
    CJK_PUNCTUATION = {
        '。', '、', '！', '？', '…', '—', '～', '・',
        '「', '」', '『', '』', '（', '）', '【', '】',
        '《', '》', '〈', '〉', '〔', '〕', '［', '］',
        '：', '；', '\u201c', '\u201d', '\u2018', '\u2019',
        '，', '．', '?', '!', ':', ';',
        '"', "'", '‚', '„', '«', '»',
    }
    # Quote protection markers
    QUOTE_MARKERS = {
        '\u201c': '\u2425',  # Opening double quote marker (␥)
        '\u201d': '\u2426',  # Closing double quote marker (␦)
        '\u201e': '\u2426',  # Alternative closing quote
        '\u2018': '\u2423',  # Opening single quote marker (␣)
        '\u2019': '\u2424',  # Closing single quote marker (assumed marker)
        '\u201a': '\u2424',  # Alternative closing quote (assumed marker)
    }
    def __init__(self, filtering_mode: str = "smart", preserve_structure: bool = True):
        """Initialize the enhanced text extractor"""
        if not html2text:
            raise ImportError("html2text is required for enhanced extraction")
        if not BeautifulSoup:
            raise ImportError("BeautifulSoup is required for enhanced extraction")
        self.filtering_mode = filtering_mode
        self.preserve_structure = preserve_structure
        self.h2t = None
        self.detected_language = None
        self._configure_html2text()
    def _detect_encoding(self, content: bytes) -> str:
        """Detect the encoding of the content"""
        try:
            # Try chardet detection (it may report encoding=None on failure)
            detected = chardet.detect(content)
            if detected and detected.get('encoding') and detected.get('confidence', 0) > 0.7:
                return detected['encoding']
        except Exception:
            pass
        # Try common CJK encodings in order
        for encoding in ['utf-8', 'gb2312', 'gbk', 'gb18030', 'big5', 'shift_jis', 'euc-kr', 'euc-jp']:
            try:
                content.decode(encoding)
                return encoding
            except Exception:
                continue
        return 'utf-8'  # Default fallback
    def _detect_content_language(self, text: str) -> str:
        """Detect the primary language of content"""
        if not text:
            return 'unknown'
        # Take a sample of the text
        sample = text[:5000]
        # Count characters by script
        korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)
        japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))
        chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)
        latin_chars = sum(1 for char in sample if ('A' <= char <= 'Z') or ('a' <= char <= 'z'))
        # Determine primary language
        if korean_chars > 50:
            return 'korean'
        elif japanese_kana > 20:
            return 'japanese'
        elif chinese_chars > 50 and japanese_kana < 10:
            return 'chinese'
        elif latin_chars > 100:
            return 'english'
        else:
            return 'unknown'
    def _configure_html2text(self):
        """Configure html2text with optimal Unicode and CJK settings"""
        self.h2t = html2text.HTML2Text()
        # Core settings for Unicode preservation
        self.h2t.unicode_snob = True
        self.h2t.escape_snob = True
        self.h2t.use_automatic_links = False
        # Layout settings
        self.h2t.body_width = 0
        self.h2t.single_line_break = False
        # Content filtering
        self.h2t.ignore_links = False
        self.h2t.ignore_images = False
        self.h2t.ignore_anchors = False
        self.h2t.skip_internal_links = False
        self.h2t.ignore_tables = False
        # Image handling - CRITICAL: Force html2text to preserve img tags as HTML
        self.h2t.images_as_html = True    # Keep images as <img> tags instead of ![]()
        self.h2t.images_to_alt = False    # Don't convert to alt text only
        self.h2t.images_with_size = True  # Include width/height attributes
        # Additional settings
        self.h2t.wrap_links = False
        self.h2t.wrap_list_items = False
        self.h2t.protect_links = True
        # Structure preservation settings
        if self.preserve_structure:
            self.h2t.bypass_tables = False
            self.h2t.ignore_emphasis = False
            self.h2t.mark_code = True
            self.h2t.ul_item_mark = '•'
        else:
            self.h2t.bypass_tables = True
            self.h2t.ignore_emphasis = True
            self.h2t.mark_code = False
    def _decode_entities(self, text: str) -> str:
        """Decode HTML entities to Unicode characters with CJK support"""
        if not text:
            return text
        # First pass: Apply known CJK-aware replacements
        for entity, unicode_char in self.UNICODE_QUOTES.items():
            text = text.replace(entity, unicode_char)
        # Second pass: standard HTML unescape
        text = html.unescape(text)

        # Third pass: handle numeric entities
        def decode_decimal(match):
            try:
                code = int(match.group(1))
                if code < 0x110000:
                    return chr(code)
            except Exception:
                pass
            return match.group(0)

        def decode_hex(match):
            try:
                code = int(match.group(1), 16)
                if code < 0x110000:
                    return chr(code)
            except Exception:
                pass
            return match.group(0)

        text = re.sub(r'&#(\d+);?', decode_decimal, text)
        text = re.sub(r'&#x([0-9a-fA-F]+);?', decode_hex, text)
        # Fourth pass: handle special CJK entities
        cjk_special_entities = {
            '&lang;': '〈', '&rang;': '〉',
            '&lceil;': '⌈', '&rceil;': '⌉',
            '&lfloor;': '⌊', '&rfloor;': '⌋',
        }
        for entity, char in cjk_special_entities.items():
            text = text.replace(entity, char)
        return text
    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode with CJK awareness"""
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            # Leave CJK text exactly as authored
            return text
        else:
            return unicodedata.normalize('NFC', text)

    def _protect_quotes(self, text: str) -> str:
        """Protect quotes by replacing with special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(original, marker)
        return text

    def _restore_quotes(self, text: str) -> str:
        """Restore quotes from special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(marker, original)
        return text

    def _preprocess_html_for_quotes(self, html_content: str) -> str:
        """Pre-process HTML to protect quotes from conversion"""
        def protect_quotes_in_text(match):
            text = match.group(1)
            return f'>{self._protect_quotes(text)}<'
        # Apply to all text between tags
        html_content = re.sub(r'>([^<]+)<', protect_quotes_in_text, html_content)
        return html_content

    def _protect_quotes_in_soup(self, soup: BeautifulSoup) -> None:
        """Protect quotes in BeautifulSoup object before processing"""
        for element in soup.find_all(string=True):
            if element.parent.name not in ['script', 'style', 'noscript']:
                original_text = str(element)
                protected_text = self._protect_quotes(original_text)
                element.replace_with(protected_text)
    def _minimal_parser_fix(self, html_content: str) -> str:
        """Apply minimal fixes only for parser errors"""
        # Fix tags with ="" pattern
        html_content = re.sub(r'<[^>]*?=\s*""\s*[^>]*?>', '', html_content)
        # Fix malformed closing tags
        html_content = re.sub(r'</\s+(\w+)>', r'</\1>', html_content)
        html_content = re.sub(r'</\s*>', '', html_content)
        html_content = re.sub(r'<//+(\w+)>', r'</\1>', html_content)
        # Escape orphaned brackets so the parser doesn't trip on them
        html_content = re.sub(r'<(?![a-zA-Z/!?])', '&lt;', html_content)
        html_content = re.sub(r'(?<![a-zA-Z0-9"/])>', '&gt;', html_content)
        # Fix unclosed tags at the end
        if html_content.rstrip().endswith('<'):
            html_content = html_content.rstrip()[:-1]
        # Remove nested opening brackets
        html_content = re.sub(r'<[^>]*?<[^>]*?>', '', html_content)
        return html_content
    def _clean_text_cjk_aware(self, text: str, preserve_structure: bool) -> str:
        """Clean extracted text with CJK awareness"""
        if not preserve_structure and self.detected_language not in ['korean', 'japanese', 'chinese']:
            # Only do aggressive markdown cleanup for non-CJK text
            text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
            text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
            text = re.sub(r'\*(.*?)\*', r'\1', text)
            text = re.sub(r'__(.*?)__', r'\1', text)
            text = re.sub(r'_(.*?)_', r'\1', text)
            text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
            text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
            # Strip fenced code blocks before inline code, so the fences
            # aren't split apart by the single-backtick pattern
            text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
            text = re.sub(r'`([^`]+)`', r'\1', text)
            text = re.sub(r'^[-*+]\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
        # Clean whitespace (more conservatively for CJK, where spacing can be deliberate)
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = re.sub(r'[ ]{3,}', ' ', text)
        else:
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = re.sub(r' {2,}', ' ', text)
        # Remove invisible characters
        invisible_chars = ['\u200b', '\u200c', '\u200d', '\ufeff', '\u2060']
        for char in invisible_chars:
            text = text.replace(char, '')
        return text.strip()
    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract chapter title from various sources"""
        # Try title tag first
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
            title = self._decode_entities(title)
            return title
        # Try headers in order
        for header_tag in ['h1', 'h2', 'h3', 'h4']:
            headers = soup.find_all(header_tag)
            for header in headers:
                title = header.get_text(strip=True)
                if title:
                    title = self._decode_entities(title)
                    if self._is_chapter_title(title):
                        return title
        return None
    def _is_chapter_title(self, text: str) -> bool:
        """Check if text looks like a chapter title"""
        if not text or len(text) > 200:
            return False
        # Common chapter patterns
        patterns = [
            r'第.{1,10}[章回話话]',
            r'Chapter\s+\d+',
            r'제\s*\d+\s*화',
            r'第\d+話',
            r'\d+\s*화',
            r'EP\.?\s*\d+',
            r'Part\s+\d+',
        ]
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        # Accept short text that doesn't contain too much punctuation
        if len(text) < 100:
            punct_count = sum(1 for c in text if c in '.,;:!?。、！？')
            if punct_count < len(text) * 0.2:
                return True
        return False
    def _extract_body_content(self, soup: BeautifulSoup, full_html: str) -> str:
        """Extract body content while preserving Unicode"""
        # Remove script and style elements first
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()
        if soup.body:
            return str(soup.body)
        else:
            return str(soup)
    def extract_chapter_content(self, html_content: str, extraction_mode: Optional[str] = None) -> Tuple[str, str, Optional[str]]:
        """Extract chapter content with proper Unicode and CJK handling"""
        try:
            # Use instance filtering_mode if not overridden
            if extraction_mode is None:
                extraction_mode = self.filtering_mode
            # Handle encoding if content is bytes
            if isinstance(html_content, bytes):
                encoding = self._detect_encoding(html_content)
                html_content = html_content.decode(encoding, errors='replace')
            # Pre-process HTML to protect quotes
            html_content = self._preprocess_html_for_quotes(html_content)
            # Pre-process HTML to decode all entities
            html_content = self._decode_entities(html_content)
            # Detect language early
            self.detected_language = self._detect_content_language(html_content)
            print(f"🌐 Detected language: {self.detected_language}")
            # Parse with BeautifulSoup
            parser = 'html.parser'
            if self.detected_language in ['korean', 'japanese', 'chinese']:
                # For CJK content, lxml might handle encoding better if available
                try:
                    import lxml  # noqa: F401
                    parser = 'lxml'
                except ImportError:
                    pass
            soup = BeautifulSoup(html_content, parser)
            # Protect quotes before any processing
            self._protect_quotes_in_soup(soup)
            # Extract title
            chapter_title = self._extract_title(soup)
            # Respect GUI toggles to exclude headers/titles BEFORE conversion
            try:
                batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
                ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
                ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
                if ignore_title_tag and soup.title:
                    # Remove <title> so it isn't included when using full extraction
                    soup.title.decompose()
                if ignore_header_tags:
                    # Remove visible headers from body prior to conversion
                    for tag_name in ['h1', 'h2', 'h3']:
                        for hdr in soup.find_all(tag_name):
                            hdr.decompose()
            except Exception:
                # Non-fatal - proceed with original soup if anything goes wrong
                pass
            # Determine content to convert (after removals)
            if extraction_mode == "full":
                content_to_convert = str(soup)
            else:
                content_to_convert = self._extract_body_content(soup, html_content)
            # Decode any entities reintroduced by serializing the soup
            content_to_convert = self._decode_entities(content_to_convert)
            # Convert to text with error handling
            try:
                clean_text = self.h2t.handle(content_to_convert)
            except (AssertionError, UnboundLocalError) as e:
                error_msg = str(e)
                known_parser_error = (
                    "cannot access local variable" in error_msg
                    or "we should not get here!" in error_msg
                    or "unexpected call to parse_endtag" in error_msg
                    or "unexpected call to parse_starttag" in error_msg
                )
                if known_parser_error:
                    print(f"⚠️ html2text encountered malformed HTML: {error_msg}")
                    print("⚠️ Applying minimal fixes...")
                    # Apply minimal fixes
                    content_to_convert = self._minimal_parser_fix(content_to_convert)
                    try:
                        clean_text = self.h2t.handle(content_to_convert)
                        print("✅ Successfully processed after minimal fixes")
                    except Exception as e2:
                        print(f"⚠️ html2text still failing: {e2}")
                        # Last resort fallback
                        clean_text = soup.get_text(separator='\n', strip=True)
                        print("✅ Used BeautifulSoup fallback")
                else:
                    # Re-raise if it's a different error
                    raise
            except Exception as e:
                print(f"⚠️ Unexpected error in html2text: {e}")
                # Fallback to BeautifulSoup
                clean_text = soup.get_text(separator='\n', strip=True)
            # Normalize only if appropriate
            clean_text = self._normalize_unicode(clean_text)
            # Clean based on settings and language
            clean_text = self._clean_text_cjk_aware(clean_text, self.preserve_structure)
            # Restore protected quotes
            clean_text = self._restore_quotes(clean_text)
            # For enhanced mode, both display and translation content are the same
            return clean_text, clean_text, chapter_title
        except Exception as e:
            print(f"❌ Enhanced extraction failed: {e}")
            raise
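
# Illustrative usage sketch (not part of the original module): shows the expected
# end-to-end call pattern. The file path is an assumption for the example; raw
# bytes are also accepted, in which case the encoding is auto-detected.
def _example_usage(path: str = "chapter.html") -> None:
    """Minimal driver for EnhancedTextExtractor, under the assumptions above."""
    extractor = EnhancedTextExtractor(filtering_mode="smart", preserve_structure=True)
    with open(path, "rb") as fh:
        raw = fh.read()  # bytes are fine; _detect_encoding handles decoding
    display_text, translation_text, title = extractor.extract_chapter_content(raw)
    print(f"Title: {title}")
    print(display_text[:200])
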
# Test function
def test_cjk_preservation():
    """Test that CJK characters and quotes are properly preserved"""
    test_cases = [
        # Korean test with quotes
        '''<html>
        <head><title>제국의 붉은 사신</title></head>
        <body>
        <p>"왜 이러는 겁니까? 우리가 무슨 잘못을 했다고!"</p>
        <p>"......"</p>
        <p>"한 번만 살려주시오! 가족을 지키려면 어쩔 수 없었소!"</p>
        <p>"응애! 응애! 응애!"</p>
        <p>"미안하구나. 모든 죄는 내가 짊어지고 사마."</p>
        </body>
        </html>''',
        # Japanese test with quotes
        '''<html>
        <head><title>第1話:始まり</title></head>
        <body>
        <h1>第1話:始まり</h1>
        <p>「こんにちは!これは日本語のテストです。」</p>
        <p>彼は言った。「これで全部ですか?」</p>
        <p>「はい、そうです」と答えた。</p>
        </body>
        </html>''',
        # Chinese test with quotes
        '''<html>
        <head><title>第一章:开始</title></head>
        <body>
        <h1>第一章:开始</h1>
        <p>“你好!这是中文测试。”</p>
        <p>他说:“这就是全部吗?”</p>
        <p>“是的,”她回答道。</p>
        </body>
        </html>''',
    ]
    extractor = EnhancedTextExtractor()
    print("=== CJK and Quote Preservation Test ===\n")
    for i, test_html in enumerate(test_cases, 1):
        print(f"--- Test Case {i} ---")
        try:
            content, _, title = extractor.extract_chapter_content(test_html)
            print(f"Title: {title}")
            print(f"Content:\n{content}\n")
            # Check for quotes preservation
            quote_checks = [
                ('"', 'Western double quotes'),
                ('「', 'Japanese left bracket'),
                ('」', 'Japanese right bracket'),
                ('“', 'Chinese double quote'),
            ]
            print("Quote preservation check:")
            quote_found = False
            for quote_char, desc in quote_checks:
                if quote_char in content:
                    print(f"  ✓ Found {desc}: {quote_char}")
                    quote_found = True
            if not quote_found:
                print("  ❌ No quotes found!")
            else:
                print("  ✅ Quotes preserved successfully!")
            # Check for image tag preservation (html2text preserves them natively)
            img_count = content.count('<img')
            if img_count > 0:
                print(f"  ✓ Found {img_count} HTML img tags (preserved natively by html2text)")
                print("  ✅ Image tags preserved successfully!")
            else:
                print("  ℹ️ No images in this test case")
        except Exception as e:
            print(f"Error processing test case {i}: {e}")
        print("-" * 50 + "\n")
if __name__ == "__main__":
    test_cjk_preservation()