#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""

Enhanced Text Extractor Module with CJK Support

Provides superior text extraction from HTML with proper Unicode handling

Optimized for Korean, Japanese, and Chinese content extraction

"""

import os
import re
import html
import unicodedata
from typing import Optional, Tuple, Union
import chardet

# Import-order workaround: import BeautifulSoup BEFORE html2text.
# Importing in the other order can trigger parser initialization issues.
try:
    from bs4 import BeautifulSoup
    # Force BeautifulSoup to initialize its parsers
    _ = BeautifulSoup("", 'html.parser')
except ImportError:
    BeautifulSoup = None
    raise ImportError("BeautifulSoup is required. Install with: pip install beautifulsoup4")

# Now import html2text AFTER BeautifulSoup
try:
    import html2text
except ImportError:
    html2text = None
    raise ImportError("html2text is required. Install with: pip install html2text")


class EnhancedTextExtractor:
    """Enhanced text extraction with proper Unicode and CJK handling"""
    
    # Unicode preservation mappings (HTML entity -> Unicode character)
    UNICODE_QUOTES = {
        # Western quotes
        '&ldquo;': '\u201c',  # Left double quotation mark
        '&rdquo;': '\u201d',  # Right double quotation mark
        '&lsquo;': '\u2018',  # Left single quotation mark
        '&rsquo;': '\u2019',  # Right single quotation mark
        '&quot;': '"',        # Standard double quote
        '&apos;': "'",        # Standard apostrophe
        
        # CJK quotes and punctuation (numeric entities)
        '&#12300;': '「',  # Japanese left corner bracket
        '&#12301;': '」',  # Japanese right corner bracket
        '&#12302;': '『',  # Japanese left white corner bracket
        '&#12303;': '』',  # Japanese right white corner bracket
        '&#65288;': '(',  # Fullwidth left parenthesis
        '&#65289;': ')',  # Fullwidth right parenthesis
        '&#12304;': '【',  # Left black lenticular bracket
        '&#12305;': '】',  # Right black lenticular bracket
        '&#12298;': '《',  # Left double angle bracket
        '&#12299;': '》',  # Right double angle bracket
        '&#65307;': ';',  # Fullwidth semicolon
        '&#65306;': ':',  # Fullwidth colon
        '&#12290;': '。',  # Ideographic full stop
        '&#65311;': '?',  # Fullwidth question mark
        '&#65281;': '!',  # Fullwidth exclamation mark
        '&#12289;': '、',  # Ideographic comma
        
        # Numeric entities for Western quotes
        '&#8220;': '\u201c',  # Left double quote (numeric)
        '&#8221;': '\u201d',  # Right double quote (numeric)
        '&#8216;': '\u2018',  # Left single quote (numeric)
        '&#8217;': '\u2019',  # Right single quote (numeric)
        
        # Common entities
        '&hellip;': '…',     # Horizontal ellipsis
        '&mdash;': '—',      # Em dash
        '&ndash;': '–',      # En dash
        '&nbsp;': '\u00A0',  # Non-breaking space
    }
    
    # CJK-specific punctuation to preserve
    CJK_PUNCTUATION = {
        '。', '、', '!', '?', '…', '—', '~', '・',
        '「', '」', '『', '』', '(', ')', '【', '】',
        '《', '》', '〈', '〉', '〔', '〕', '[', ']',
        ':', ';', '\u201c', '\u201d', '\u2018', '\u2019',
        ',', '.', '?', '!', ':', ';',
        '"', "'", '‚', '„', '«', '»',
    }
    
    # Quote protection markers (curly quote -> placeholder marker)
    QUOTE_MARKERS = {
        '\u201c': '␥',   # Opening double quote marker
        '\u201d': '␦',   # Closing double quote marker
        '\u2018': '␣',   # Opening single quote marker
        '\u2019': '␤',   # Closing single quote marker
    }
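    # Illustrative round-trip with the markers above:
    #   _protect_quotes('\u201cHi\u201d')  -> '␥Hi␦'
    #   _restore_quotes('␥Hi␦')            -> '\u201cHi\u201d'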
    
    
    def __init__(self, filtering_mode: str = "smart", preserve_structure: bool = True):
        """Initialize the enhanced text extractor"""
        if not html2text:
            raise ImportError("html2text is required for enhanced extraction")
        
        if not BeautifulSoup:
            raise ImportError("BeautifulSoup is required for enhanced extraction")
            
        self.filtering_mode = filtering_mode
        self.preserve_structure = preserve_structure
        self.h2t = None
        self.detected_language = None
        
        self._configure_html2text()
    
    def _detect_encoding(self, content: bytes) -> str:
        """Detect the encoding of the content"""
        try:
            # Try chardet detection (it can return encoding=None, so guard it)
            detected = chardet.detect(content)
            if detected.get('encoding') and (detected.get('confidence') or 0) > 0.7:
                return detected['encoding']
        except Exception:
            pass
        
        # Try common CJK encodings in order
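        # Note: the first codec that decodes without error wins, so byte
        # strings valid in several CJK encodings may resolve to an earlier
        # entry in the list.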
        for encoding in ['utf-8', 'gb2312', 'gbk', 'gb18030', 'big5', 'shift_jis', 'euc-kr', 'euc-jp']:
            try:
                content.decode(encoding)
                return encoding
            except Exception:
                continue
        
        return 'utf-8'  # Default fallback
    
    def _detect_content_language(self, text: str) -> str:
        """Detect the primary language of content"""
        if not text:
            return 'unknown'
        
        # Take a sample of the text
        sample = text[:5000]
        
        # Count characters by script
        korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF)  # Hangul syllables
        japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF))  # Hiragana / Katakana
        chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF)  # CJK Unified Ideographs (also matches kanji/hanja)
        latin_chars = sum(1 for char in sample if ('A' <= char <= 'Z') or ('a' <= char <= 'z'))  # ASCII letters only
        
        # Determine primary language
        if korean_chars > 50:
            return 'korean'
        elif japanese_kana > 20:
            return 'japanese'
        elif chinese_chars > 50 and japanese_kana < 10:
            return 'chinese'
        elif latin_chars > 100:
            return 'english'
        else:
            return 'unknown'
    
    def _configure_html2text(self):
        """Configure html2text with optimal Unicode and CJK settings"""
        self.h2t = html2text.HTML2Text()
        
        # Core settings for Unicode preservation
        self.h2t.unicode_snob = True
        self.h2t.escape_snob = True
        self.h2t.use_automatic_links = False
        
        # Layout settings
        self.h2t.body_width = 0
        self.h2t.single_line_break = False
        
        # Content filtering
        self.h2t.ignore_links = False
        self.h2t.ignore_images = False
        self.h2t.ignore_anchors = False
        self.h2t.skip_internal_links = False
        self.h2t.ignore_tables = False
        
        # Image handling - CRITICAL: Force html2text to preserve img tags as HTML
        self.h2t.images_as_html = True  # Keep images as <img> tags instead of ![]()
        self.h2t.images_to_alt = False  # Don't convert to alt text only
        self.h2t.images_with_size = True  # Include width/height attributes
        
        # Additional settings
        self.h2t.wrap_links = False
        self.h2t.wrap_list_items = False
        self.h2t.protect_links = True
        
        # Structure preservation settings
        if self.preserve_structure:
            self.h2t.bypass_tables = False
            self.h2t.ignore_emphasis = False
            self.h2t.mark_code = True
            self.h2t.ul_item_mark = '•'
        else:
            self.h2t.bypass_tables = True
            self.h2t.ignore_emphasis = True
            self.h2t.mark_code = False
    
    def _decode_entities(self, text: str) -> str:
        """Decode HTML entities to Unicode characters with CJK support"""
        if not text:
            return text
        
        # First pass: Apply known CJK-aware replacements
        for entity, unicode_char in self.UNICODE_QUOTES.items():
            text = text.replace(entity, unicode_char)
        
        # Second pass: standard HTML unescape
        text = html.unescape(text)
        
        # Third pass: handle numeric entities (skip the surrogate range, which
        # chr() accepts but which cannot survive later encoding)
        def decode_decimal(match):
            try:
                code = int(match.group(1))
                if code < 0x110000 and not (0xD800 <= code <= 0xDFFF):
                    return chr(code)
            except Exception:
                pass
            return match.group(0)
        
        def decode_hex(match):
            try:
                code = int(match.group(1), 16)
                if code < 0x110000 and not (0xD800 <= code <= 0xDFFF):
                    return chr(code)
            except Exception:
                pass
            return match.group(0)
        
        text = re.sub(r'&#(\d+);?', decode_decimal, text)
        text = re.sub(r'&#x([0-9a-fA-F]+);?', decode_hex, text)
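        # e.g. '&#20320;&#22909;' -> '你好', '&#x4F60;' -> '你'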
        
        # Fourth pass: handle special CJK entities
        cjk_special_entities = {
            '&lang;': '〈', '&rang;': '〉',
            '&lceil;': '⌈', '&rceil;': '⌉',
            '&lfloor;': '⌊', '&rfloor;': '⌋',
        }
        
        for entity, char in cjk_special_entities.items():
            text = text.replace(entity, char)
        
        return text
    
    def _normalize_unicode(self, text: str) -> str:
        """Normalize Unicode with CJK awareness"""
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            return text
        else:
            return unicodedata.normalize('NFC', text)
    
    def _protect_quotes(self, text: str) -> str:
        """Protect quotes by replacing with special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(original, marker)
        return text
    
    def _restore_quotes(self, text: str) -> str:
        """Restore quotes from special markers"""
        for original, marker in self.QUOTE_MARKERS.items():
            text = text.replace(marker, original)
        return text
    
    def _preprocess_html_for_quotes(self, html_content: str) -> str:
        """Pre-process HTML to protect quotes from conversion"""
        def protect_quotes_in_text(match):
            text = match.group(1)
            return f'>{self._protect_quotes(text)}<'
        
        # Apply to all text between tags; this simple regex also touches
        # <script>/<style> bodies, which is harmless because the markers are
        # restored verbatim after conversion.
        html_content = re.sub(r'>([^<]+)<', protect_quotes_in_text, html_content)
        return html_content
    
    def _protect_quotes_in_soup(self, soup: BeautifulSoup) -> None:
        """Protect quotes in BeautifulSoup object before processing"""
        for element in soup.find_all(string=True):
            if element.parent.name not in ['script', 'style', 'noscript']:
                original_text = str(element)
                protected_text = self._protect_quotes(original_text)
                element.replace_with(protected_text)
    
    def _minimal_parser_fix(self, html_content: str) -> str:
        """Apply minimal fixes only for parser errors"""
        # Fix tags with ="" pattern
        html_content = re.sub(r'<[^>]*?=\s*""\s*[^>]*?>', '', html_content)
        
        # Fix malformed closing tags
        html_content = re.sub(r'</\s+(\w+)>', r'</\1>', html_content)
        html_content = re.sub(r'</\s*>', '', html_content)
        html_content = re.sub(r'<//+(\w+)>', r'</\1>', html_content)
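        # e.g. '</ p>' -> '</p>', '<//div>' -> '</div>'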
        
        # Fix orphaned brackets
        html_content = re.sub(r'<(?![a-zA-Z/!?])', '&lt;', html_content)
        html_content = re.sub(r'(?<![a-zA-Z0-9"/])>', '&gt;', html_content)
        
        # Fix unclosed tags at the end
        if html_content.rstrip().endswith('<'):
            html_content = html_content.rstrip()[:-1]
        
        # Remove nested opening brackets
        html_content = re.sub(r'<[^>]*?<[^>]*?>', '', html_content)
        
        return html_content
    
    def _clean_text_cjk_aware(self, text: str, preserve_structure: bool) -> str:
        """Clean extracted text with CJK awareness"""
        if not preserve_structure and self.detected_language not in ['korean', 'japanese', 'chinese']:
            # Only do aggressive cleanup for non-CJK text
            text = re.sub(r'^#+\s*', '', text, flags=re.MULTILINE)
            text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
            text = re.sub(r'\*(.*?)\*', r'\1', text)
            text = re.sub(r'__(.*?)__', r'\1', text)
            text = re.sub(r'_(.*?)_', r'\1', text)
            text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
            text = re.sub(r'!\[([^\]]*)\]\([^)]+\)', '', text)
            text = re.sub(r'`([^`]+)`', r'\1', text)
            text = re.sub(r'```[^`]*```', '', text, flags=re.DOTALL)
            text = re.sub(r'^[-*+]\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^\d+\.\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
            text = re.sub(r'^[-_*]{3,}$', '', text, flags=re.MULTILINE)
        
        # Clean whitespace
        if self.detected_language in ['korean', 'japanese', 'chinese']:
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = re.sub(r'[ ]{3,}', '  ', text)
        else:
            text = re.sub(r'\n{3,}', '\n\n', text)
            text = re.sub(r' {2,}', ' ', text)
        
        # Remove invisible characters
        invisible_chars = ['\u200b', '\u200c', '\u200d', '\ufeff', '\u2060']
        for char in invisible_chars:
            text = text.replace(char, '')
        
        return text.strip()
    
    def _extract_title(self, soup: BeautifulSoup) -> Optional[str]:
        """Extract chapter title from various sources"""
        # Try title tag first
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
            title = self._decode_entities(title)
            return title
        
        # Try headers in order
        for header_tag in ['h1', 'h2', 'h3', 'h4']:
            headers = soup.find_all(header_tag)
            for header in headers:
                title = header.get_text(strip=True)
                if title:
                    title = self._decode_entities(title)
                    if self._is_chapter_title(title):
                        return title
        
        return None
    
    def _is_chapter_title(self, text: str) -> bool:
        """Check if text looks like a chapter title"""
        if not text or len(text) > 200:
            return False
        
        # Common chapter patterns
        patterns = [
            r'第.{1,10}[章回話话]',
            r'Chapter\s+\d+',
            r'제\s*\d+\s*화',
            r'第\d+話',
            r'\d+\s*화',
            r'EP\.?\s*\d+',
            r'Part\s+\d+',
        ]
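        # Illustrative matches for the patterns above:
        #   "第12章", "Chapter 7", "제 3 화", "EP. 5", "Part 2"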
        
        for pattern in patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return True
        
        # Check if it's short and doesn't contain too much punctuation
        if len(text) < 100:
            punct_count = sum(1 for c in text if c in '.,;:!?。、!?')
            if punct_count < len(text) * 0.2:
                return True
        
        return False
    
    def _extract_body_content(self, soup: BeautifulSoup) -> str:
        """Extract body content while preserving Unicode"""
        # Remove script and style elements first
        for element in soup(['script', 'style', 'noscript']):
            element.decompose()
        
        if soup.body:
            return str(soup.body)
        else:
            return str(soup)
    
    def extract_chapter_content(self, html_content: Union[str, bytes], extraction_mode: Optional[str] = None) -> Tuple[str, str, Optional[str]]:
        """Extract chapter content with proper Unicode and CJK handling"""
        try:
            # Use instance filtering_mode if not overridden
            if extraction_mode is None:
                extraction_mode = self.filtering_mode
            
            # Handle encoding if content is bytes
            if isinstance(html_content, bytes):
                encoding = self._detect_encoding(html_content)
                html_content = html_content.decode(encoding, errors='replace')
            
            # Pre-process HTML to protect quotes
            html_content = self._preprocess_html_for_quotes(html_content)
            
            # Pre-process HTML to decode all entities
            html_content = self._decode_entities(html_content)
            
            # Detect language early
            self.detected_language = self._detect_content_language(html_content)
            print(f"🌐 Detected language: {self.detected_language}")
            
            # Parse with BeautifulSoup
            parser = 'html.parser'
            if self.detected_language in ['korean', 'japanese', 'chinese']:
                # For CJK content, lxml might handle encoding better if available
                try:
                    import lxml
                    parser = 'lxml'
                except ImportError:
                    pass
            
            soup = BeautifulSoup(html_content, parser)
            
            # Protect quotes before any processing
            self._protect_quotes_in_soup(soup)
            
            # Extract title
            chapter_title = self._extract_title(soup)
            
            # Respect GUI toggles to exclude headers/titles BEFORE conversion
            try:
                batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1'
                ignore_title_tag = os.getenv('IGNORE_TITLE', '0') == '1' and batch_translate_active
                ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active
                if ignore_title_tag and soup.title:
                    # Remove <title> so it isn't included when using full extraction
                    soup.title.decompose()
                if ignore_header_tags:
                    # Remove visible headers from body prior to conversion
                    for tag_name in ['h1', 'h2', 'h3']:
                        for hdr in soup.find_all(tag_name):
                            hdr.decompose()
            except Exception:
                # Non-fatal – proceed with original soup if anything goes wrong
                pass
            
            # Determine content to convert (after removals)
            if extraction_mode == "full":
                content_to_convert = str(soup)
            else:
                content_to_convert = self._extract_body_content(soup)
            
            # Decode any entities that remain after extraction
            content_to_convert = self._decode_entities(content_to_convert)
            
            # Convert to text with error handling
            try:
                clean_text = self.h2t.handle(content_to_convert)
            except (AssertionError, UnboundLocalError) as e:
                error_msg = str(e)
                known_parser_errors = (
                    "cannot access local variable" in error_msg
                    or "we should not get here!" in error_msg
                    or "unexpected call to parse_endtag" in error_msg
                    or "unexpected call to parse_starttag" in error_msg
                )
                if known_parser_errors:
                    print(f"⚠️ html2text encountered malformed HTML: {error_msg}")
                    print("⚠️ Applying minimal fixes...")
                    # Apply minimal fixes
                    content_to_convert = self._minimal_parser_fix(content_to_convert)
                    try:
                        clean_text = self.h2t.handle(content_to_convert)
                        print(f"✅ Successfully processed after minimal fixes")
                    except Exception as e2:
                        print(f"⚠️ html2text still failing: {e2}")
                        # Last resort fallback
                        clean_text = soup.get_text(separator='\n', strip=True)
                        print(f"✅ Used BeautifulSoup fallback")
                else:
                    # Re-raise if it's a different error
                    raise
            except Exception as e:
                print(f"⚠️ Unexpected error in html2text: {e}")
                # Fallback to BeautifulSoup
                clean_text = soup.get_text(separator='\n', strip=True)
            
            # Normalize only if appropriate
            clean_text = self._normalize_unicode(clean_text)
            
            # Clean based on settings and language
            clean_text = self._clean_text_cjk_aware(clean_text, self.preserve_structure)
            
            # Restore protected quotes
            clean_text = self._restore_quotes(clean_text)
            
            # For enhanced mode, both display and translation content are the same
            return clean_text, clean_text, chapter_title
                
        except Exception as e:
            print(f"❌ Enhanced extraction failed: {e}")
            raise


# Test function
def test_cjk_preservation():
    """Test that CJK characters and quotes are properly preserved"""
    test_cases = [
        # Korean test with quotes
        '''<html>
        <head><title>제국의 붉은 사신</title></head>
        <body>
            <p>"왜 이러는 겁니까? 우리가 무슨 잘못을 했다고!"</p>
            <p>"......"</p>
            <p>"한 번만 살려주시오! 가족을 지키려면 어쩔 수 없었소!"</p>
            <p>"응애! 응애! 응애!"</p>
            <p>"미안하구나. 모든 죄는 내가 짊어지고 사마."</p>
        </body>
        </html>''',
        
        # Japanese test with quotes
        '''<html>
        <head><title>第1話:始まり</title></head>
        <body>
            <h1>第1話:始まり</h1>
            <p>「こんにちは!これは日本語のテストです。」</p>
            <p>彼は言った。「これで全部ですか?」</p>
            <p>「はい、そうです」と答えた。</p>
        </body>
        </html>''',
        
        # Chinese test with quotes
        '''<html>
        <head><title>第一章:开始</title></head>
        <body>
            <h1>第一章:开始</h1>
            <p>“你好!这是中文测试。”</p>
            <p>他说:“这就是全部吗?”</p>
            <p>“是的,”她回答道。</p>
        </body>
        </html>''',
    ]
    
    extractor = EnhancedTextExtractor()
    
    print("=== CJK and Quote Preservation Test ===\n")
    
    for i, test_html in enumerate(test_cases, 1):
        print(f"--- Test Case {i} ---")
        try:
            content, _, title = extractor.extract_chapter_content(test_html)
            
            print(f"Title: {title}")
            print(f"Content:\n{content}\n")
            
            # Check for quotes preservation
            quote_checks = [
                ('"', 'Western double quotes'),
                ('「', 'Japanese left bracket'),
                ('」', 'Japanese right bracket'),
                ('“', 'Chinese double quote'),
            ]
            
            print("Quote preservation check:")
            quote_found = False
            
            for quote_char, desc in quote_checks:
                if quote_char in content:
                    print(f"  ✓ Found {desc}: {quote_char}")
                    quote_found = True
            
            if not quote_found:
                print("  ❌ No quotes found!")
            else:
                print("  ✅ Quotes preserved successfully!")
            
            # Check for image tag preservation (html2text now preserves them natively)
            img_count = content.count('<img')
            if img_count > 0:
                print(f"  ✓ Found {img_count} HTML img tags (preserved natively by html2text)")
                print("  ✅ Image tags preserved successfully!")
            else:
                print("  ℹ️ No images in this test case")
                
        except Exception as e:
            print(f"Error processing test case {i}: {e}")
        
        print("-" * 50 + "\n")


if __name__ == "__main__":
    test_cjk_preservation()