# txt_processor.py
import os
import re
import json
import hashlib
from decimal import Decimal
from typing import List, Tuple, Dict

from bs4 import BeautifulSoup

from chapter_splitter import ChapterSplitter


class TextFileProcessor:
    """Process plain text files for translation.

    Splits a plain-text file into chapter-sized chunks (HTML-wrapped) that
    respect the translation model's output-token budget.
    """

    def __init__(self, file_path: str, output_dir: str):
        """Remember the source file and prepare the token-aware splitter.

        Args:
            file_path: Path of the plain-text file to process.
            output_dir: Directory where translated output will be written.
        """
        self.file_path = file_path
        self.output_dir = output_dir
        # Base name without extension, used only in progress messages.
        self.file_base = os.path.splitext(os.path.basename(file_path))[0]

        # Initialize chapter splitter; model name drives token counting.
        model_name = os.getenv("MODEL", "gpt-3.5-turbo")
        self.chapter_splitter = ChapterSplitter(model_name=model_name)

    def extract_chapters(self) -> List[Dict]:
        """Extract chapters from the text file.

        Returns:
            List of chapter dicts (see _process_chapters_for_splitting for
            the exact keys), one entry per final translation chunk.
        """
        with open(self.file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        # First, detect chapters in the content.
        raw_chapters = self._detect_chapters(content)

        # Then, split any chapter that exceeds the token budget.
        final_chapters = self._process_chapters_for_splitting(raw_chapters)

        print(f"📚 Extracted {len(final_chapters)} total chunks from {len(raw_chapters)} detected chapters")
        return final_chapters

    def _detect_chapters(self, content: str) -> List[Dict]:
        """Detect chapter boundaries in the text.

        Args:
            content: Entire file contents as one string.

        Returns:
            List of dicts with keys 'num', 'title', 'content'. Falls back to
            a single 'Section 1' chapter when no markers are found.
        """
        chapters = []

        # Chapter detection patterns, compiled once (they run per line below).
        chapter_patterns = [
            # English patterns
            (re.compile(r'^Chapter\s+(\d+).*$'), 'chapter'),
            (re.compile(r'^CHAPTER\s+(\d+).*$'), 'chapter'),
            (re.compile(r'^Ch\.\s*(\d+).*$'), 'chapter'),
            # Numbered sections
            (re.compile(r'^(\d+)\.\s+(.*)$'), 'numbered'),
            (re.compile(r'^Part\s+(\d+).*$'), 'part'),
            # Scene breaks (these don't have numbers)
            (re.compile(r'^\*\s*\*\s*\*.*$'), 'break'),
            (re.compile(r'^---+.*$'), 'break'),
            (re.compile(r'^===+.*$'), 'break'),
        ]

        # Find all chapter markers and their positions.
        chapter_breaks = []
        lines = content.split('\n')
        for line_num, line in enumerate(lines):
            for pattern, pattern_type in chapter_patterns:
                match = pattern.match(line.strip())
                if match:
                    chapter_breaks.append({
                        'line_num': line_num,
                        'line': line,
                        'type': pattern_type,
                        'match': match
                    })
                    break  # first matching pattern wins for this line

        if not chapter_breaks:
            # No chapter markers found, treat as single chapter.
            print(f"No chapter markers found in {self.file_base}, treating as single document")
            # Use "Section 1" instead of the filename so no stray numbers are
            # extracted from the file name later.
            chapters = [{
                'num': 1,
                'title': 'Section 1',
                'content': content
            }]
        else:
            # Split content by chapter markers.
            print(f"Found {len(chapter_breaks)} chapter markers in {self.file_base}")

            for i, chapter_break in enumerate(chapter_breaks):
                # Determine chapter number and title.
                chapter_num, chapter_title = self._extract_chapter_info(chapter_break, i)

                # Content starts on the line after the chapter marker.
                start_line = chapter_break['line_num'] + 1

                # This chapter ends where the next marker begins (or at EOF).
                if i < len(chapter_breaks) - 1:
                    end_line = chapter_breaks[i + 1]['line_num']
                else:
                    end_line = len(lines)

                chapter_lines = lines[start_line:end_line]
                chapter_content = '\n'.join(chapter_lines).strip()

                if chapter_content:  # Only add if there's actual content
                    chapters.append({
                        'num': chapter_num,
                        'title': chapter_title,
                        'content': chapter_content
                    })

        return chapters

    def _extract_chapter_info(self, chapter_break: Dict, index: int) -> Tuple[int, str]:
        """Extract chapter number and title from a chapter break.

        Args:
            chapter_break: Dict produced by _detect_chapters ('type', 'line',
                'match', ...).
            index: Zero-based position of this break; used as the fallback
                chapter number (index + 1).

        Returns:
            (chapter_num, chapter_title) tuple.
        """
        if chapter_break['type'] == 'break':
            # Scene breaks carry no number; synthesize one from position.
            chapter_num = index + 1
            chapter_title = f"Section {chapter_num}"
            return chapter_num, chapter_title

        # Non-break markers keep the full marker line as the title.
        chapter_title = chapter_break['line'].strip()
        # Fall back to the positional number unless a usable digit group exists.
        chapter_num = index + 1

        match_groups = chapter_break['match'].groups()
        if match_groups and match_groups[0]:  # group exists AND is not empty
            num_str = match_groups[0].strip()
            if num_str:  # only try to convert non-empty text
                try:
                    chapter_num = int(num_str)
                except (ValueError, IndexError):
                    # Failed to convert to int; positional fallback stands.
                    pass

        return chapter_num, chapter_title

    def _process_chapters_for_splitting(self, raw_chapters: List[Dict]) -> List[Dict]:
        """Process chapters and split them if they exceed token limits.

        Args:
            raw_chapters: Chapter dicts from _detect_chapters.

        Returns:
            Final chunk dicts with keys: 'num', 'title', 'body' (HTML),
            'filename', 'content_hash', 'file_size', 'has_images',
            'is_chunk' and, for split chapters, 'chunk_info'.
        """
        final_chapters = []

        # Budget is based on OUTPUT token limits: the translation must fit in
        # the model's response, scaled by the expected compression factor.
        max_output_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
        compression_factor = float(os.getenv("COMPRESSION_FACTOR", "0.8"))
        safety_margin_output = 500

        available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
        available_tokens = max(available_tokens, 1000)  # never shrink below a sane floor

        print(f"📊 Text file chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")

        for chapter_data in raw_chapters:
            # Convert chapter content to HTML format before measuring, since
            # the HTML wrapper is what actually gets sent for translation.
            chapter_html = self._text_to_html(chapter_data['content'])
            chapter_tokens = self.chapter_splitter.count_tokens(chapter_html)

            if chapter_tokens > available_tokens:
                # Chapter needs splitting.
                print(f"Chapter {chapter_data['num']} ({chapter_data['title']}) has {chapter_tokens} tokens, splitting...")
                chunks = self.chapter_splitter.split_chapter(chapter_html, available_tokens)

                # Add each chunk as a separate chapter.
                for chunk_html, chunk_idx, total_chunks in chunks:
                    chunk_title = chapter_data['title']
                    if total_chunks > 1:
                        chunk_title = f"{chapter_data['title']} (Part {chunk_idx}/{total_chunks})"

                    # Float chapter numbers for chunks: 1.0, 1.1, 1.2, etc.
                    # NOTE(review): 10+ chunks would collide with the next
                    # chapter's number (1.9 + 0.1 -> 2.0) — confirm splitter
                    # never emits that many parts.
                    chunk_num = round(chapter_data['num'] + (chunk_idx - 1) * 0.1, 1)

                    final_chapters.append({
                        'num': chunk_num,
                        'title': chunk_title,
                        'body': chunk_html,
                        # Neutral name avoids leaking file_base into filenames.
                        'filename': f"section_{int(chapter_data['num'])}_part{chunk_idx}.txt",
                        'content_hash': self._generate_hash(chunk_html),
                        'file_size': len(chunk_html),
                        'has_images': False,
                        'is_chunk': True,
                        'chunk_info': {
                            'chunk_idx': chunk_idx,
                            'total_chunks': total_chunks,
                            'original_chapter': chapter_data['num']
                        }
                    })
            else:
                # Chapter is small enough, add as-is (number stays an int).
                final_chapters.append({
                    'num': chapter_data['num'],
                    'title': chapter_data['title'],
                    'body': chapter_html,
                    'filename': f"section_{chapter_data['num']}.txt",
                    'content_hash': self._generate_hash(chapter_html),
                    'file_size': len(chapter_html),
                    'has_images': False,
                    'is_chunk': False
                })

        # Ensure we have at least one chapter.
        if not final_chapters:
            # Fallback: create a single chapter with all content.
            all_content = '\n\n'.join(ch['content'] for ch in raw_chapters if ch.get('content'))
            if not all_content and raw_chapters:
                all_content = raw_chapters[0].get('content', '')

            final_chapters.append({
                'num': 1,
                'title': 'Section 1',
                'body': self._text_to_html(all_content or 'Empty file'),
                'filename': 'section_1.txt',
                'content_hash': self._generate_hash(all_content or ''),
                'file_size': len(all_content or ''),
                'has_images': False,
                'is_chunk': False
            })

        return final_chapters

    def _text_to_html(self, text: str) -> str:
        """Convert plain text to HTML format"""
        # Escape HTML characters. FIX: the previous replace() targets were
        # identity no-ops ('&' -> '&'); replaced with real entities, ampersand
        # first so it doesn't double-escape the others.
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')

        # Split into paragraphs
        paragraphs = text.split('\n\n')

        # Wrap each paragraph in
tags html_parts = [] for para in paragraphs: para = para.strip() if para: # Check if it's a chapter heading if re.match(r'^(Chapter|CHAPTER|Ch\.|Part)\s+\d+', para): html_parts.append(f'
{para}
') # Create a simple HTML structure html = f"""