# txt_processor.py
import os
import re
import json
from typing import List, Tuple, Dict
from bs4 import BeautifulSoup
from chapter_splitter import ChapterSplitter
import hashlib
class TextFileProcessor:
"""Process plain text files for translation"""
def __init__(self, file_path: str, output_dir: str):
self.file_path = file_path
self.output_dir = output_dir
self.file_base = os.path.splitext(os.path.basename(file_path))[0]
# Initialize chapter splitter
model_name = os.getenv("MODEL", "gpt-3.5-turbo")
self.chapter_splitter = ChapterSplitter(model_name=model_name)
def extract_chapters(self) -> List[Dict]:
"""Extract chapters from text file"""
with open(self.file_path, 'r', encoding='utf-8') as f:
content = f.read()
# First, detect chapters in the content
raw_chapters = self._detect_chapters(content)
# Then, process each chapter for splitting if needed
final_chapters = self._process_chapters_for_splitting(raw_chapters)
print(f"📚 Extracted {len(final_chapters)} total chunks from {len(raw_chapters)} detected chapters")
return final_chapters
def _detect_chapters(self, content: str) -> List[Dict]:
"""Detect chapter boundaries in the text"""
chapters = []
# Chapter detection patterns
chapter_patterns = [
# English patterns
(r'^Chapter\s+(\d+).*$', 'chapter'),
(r'^CHAPTER\s+(\d+).*$', 'chapter'),
(r'^Ch\.\s*(\d+).*$', 'chapter'),
# Numbered sections
(r'^(\d+)\.\s+(.*)$', 'numbered'),
(r'^Part\s+(\d+).*$', 'part'),
# Scene breaks (these don't have numbers)
(r'^\*\s*\*\s*\*.*$', 'break'),
(r'^---+.*$', 'break'),
(r'^===+.*$', 'break'),
]
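        # Illustrative lines each pattern type matches (examples only, not
        # taken from any particular input):
        #     "Chapter 12: The Storm"    -> 'chapter'
        #     "Ch. 3"                    -> 'chapter'
        #     "7. A New Beginning"       -> 'numbered'
        #     "Part 2"                   -> 'part'
        #     "* * *" / "---" / "==="    -> 'break'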
# Find all chapter markers and their positions
chapter_breaks = []
lines = content.split('\n')
for line_num, line in enumerate(lines):
for pattern, pattern_type in chapter_patterns:
match = re.match(pattern, line.strip())
if match:
chapter_breaks.append({
'line_num': line_num,
'line': line,
'type': pattern_type,
'match': match
})
break
if not chapter_breaks:
# No chapter markers found, treat as single chapter
print(f"No chapter markers found in {self.file_base}, treating as single document")
            # Use a generic "Section 1" title rather than the filename, so
            # digits in the filename are never mistaken for a chapter number
            chapters = [{
                'num': 1,
                'title': 'Section 1',
                'content': content
            }]
else:
# Split content by chapter markers
print(f"Found {len(chapter_breaks)} chapter markers in {self.file_base}")
for i, chapter_break in enumerate(chapter_breaks):
# Determine chapter number and title
chapter_num, chapter_title = self._extract_chapter_info(chapter_break, i)
# Get content for this chapter
start_line = chapter_break['line_num'] + 1 # Start after the chapter marker
# Find where this chapter ends
if i < len(chapter_breaks) - 1:
end_line = chapter_breaks[i + 1]['line_num']
else:
end_line = len(lines)
# Extract chapter content
chapter_lines = lines[start_line:end_line]
chapter_content = '\n'.join(chapter_lines).strip()
if chapter_content: # Only add if there's actual content
chapters.append({
'num': chapter_num,
'title': chapter_title,
'content': chapter_content
})
return chapters
def _extract_chapter_info(self, chapter_break: Dict, index: int) -> Tuple[int, str]:
"""Extract chapter number and title from a chapter break"""
        if chapter_break['type'] == 'break':
            # Scene breaks carry no number; number them by position
            chapter_num = index + 1
            chapter_title = f"Section {chapter_num}"
        else:
            chapter_title = chapter_break['line'].strip()
            # Try to read a chapter number from the first captured group;
            # fall back to the positional index when it is missing or invalid
            match_groups = chapter_break['match'].groups()
            num_str = match_groups[0].strip() if match_groups and match_groups[0] else ''
            try:
                chapter_num = int(num_str) if num_str else index + 1
            except ValueError:
                chapter_num = index + 1
        return chapter_num, chapter_title
def _process_chapters_for_splitting(self, raw_chapters: List[Dict]) -> List[Dict]:
"""Process chapters and split them if they exceed token limits"""
final_chapters = []
# Calculate based on OUTPUT token limits
max_output_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
compression_factor = float(os.getenv("COMPRESSION_FACTOR", "0.8"))
safety_margin_output = 500
# Calculate chunk size based on output limit
available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
available_tokens = max(available_tokens, 1000)
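        # With the defaults above: (8192 - 500) / 0.8 = 9615 tokens per chunk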
print(f"📊 Text file chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
for chapter_data in raw_chapters:
# Convert chapter content to HTML format
chapter_html = self._text_to_html(chapter_data['content'])
chapter_tokens = self.chapter_splitter.count_tokens(chapter_html)
if chapter_tokens > available_tokens:
# Chapter needs splitting
print(f"Chapter {chapter_data['num']} ({chapter_data['title']}) has {chapter_tokens} tokens, splitting...")
chunks = self.chapter_splitter.split_chapter(chapter_html, available_tokens)
# Add each chunk as a separate chapter
for chunk_html, chunk_idx, total_chunks in chunks:
chunk_title = chapter_data['title']
if total_chunks > 1:
chunk_title = f"{chapter_data['title']} (Part {chunk_idx}/{total_chunks})"
# Create float chapter numbers for chunks: 1.0, 1.1, 1.2, etc.
chunk_num = round(chapter_data['num'] + (chunk_idx - 1) * 0.1, 1)
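                    # Caveat: this numbering assumes fewer than 11 chunks per
                    # chapter; an 11th chunk would land on the next chapter's
                    # integer number.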
final_chapters.append({
'num': chunk_num,
'title': chunk_title,
'body': chunk_html,
'filename': f"section_{int(chapter_data['num'])}_part{chunk_idx}.txt", # Changed to avoid using file_base
'content_hash': self._generate_hash(chunk_html),
'file_size': len(chunk_html),
'has_images': False,
'is_chunk': True,
'chunk_info': {
'chunk_idx': chunk_idx,
'total_chunks': total_chunks,
'original_chapter': chapter_data['num']
}
})
else:
# Chapter is small enough, add as-is
final_chapters.append({
'num': chapter_data['num'], # Keep as integer for non-split chapters
'title': chapter_data['title'],
'body': chapter_html,
'filename': f"section_{chapter_data['num']}.txt", # Changed to avoid using file_base
'content_hash': self._generate_hash(chapter_html),
'file_size': len(chapter_html),
'has_images': False,
'is_chunk': False
})
# Ensure we have at least one chapter
if not final_chapters:
# Fallback: create a single chapter with all content
all_content = '\n\n'.join(ch['content'] for ch in raw_chapters if ch.get('content'))
if not all_content and raw_chapters:
all_content = raw_chapters[0].get('content', '')
            final_chapters.append({
                'num': 1,
                'title': 'Section 1',  # generic title, independent of the source filename
                'body': self._text_to_html(all_content or 'Empty file'),
                'filename': 'section_1.txt',
                'content_hash': self._generate_hash(all_content or ''),
                'file_size': len(all_content or ''),
                'has_images': False,
                'is_chunk': False
            })
return final_chapters
def _text_to_html(self, text: str) -> str:
"""Convert plain text to HTML format"""
# Escape HTML characters
text = text.replace('&', '&amp;')
text = text.replace('<', '&lt;')
text = text.replace('>', '&gt;')
# Split into paragraphs
paragraphs = text.split('\n\n')
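        # e.g. "Line one\nLine two\n\nNext para" splits into two paragraphs;
        # single newlines inside a paragraph become <br> tags below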
# Wrap each paragraph in <p> tags
html_parts = []
for para in paragraphs:
para = para.strip()
if para:
# Check if it's a chapter heading
if re.match(r'^(Chapter|CHAPTER|Ch\.|Part)\s+\d+', para):
html_parts.append(f'<h1>{para}</h1>')
else:
# Replace single newlines with <br> within paragraphs
para = para.replace('\n', '<br>\n')
html_parts.append(f'<p>{para}</p>')
# Create a simple HTML structure
html = f"""<html>
<head>
<title>{self.file_base}</title>
<meta charset="utf-8"/>
</head>
<body>
{''.join(html_parts)}
</body>
</html>"""
return html
    def _generate_hash(self, content: str) -> str:
        """Generate a content fingerprint for change detection"""
        # MD5 is used only as a fast fingerprint here, not for security
        return hashlib.md5(content.encode('utf-8')).hexdigest()
def save_original_structure(self):
"""Save original text file structure info"""
metadata = {
'source_file': os.path.basename(self.file_path),
'type': 'text',
'encoding': 'utf-8'
}
metadata_path = os.path.join(self.output_dir, 'metadata.json')
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)
def create_output_structure(self, translated_chapters: List[Tuple[str, str]]) -> str:
"""Create output text file from translated chapters"""
        # Sort chapters numerically so that section_10 follows section_2
        # (plain lexicographic ordering would put "section_10" before "section_2")
        def natural_key(item):
            return [int(n) for n in re.findall(r'\d+', item[0])]
        sorted_chapters = sorted(translated_chapters, key=natural_key)
# Combine all content
all_content = []
for filename, content in sorted_chapters:
# Extract text from HTML
soup = BeautifulSoup(content, 'html.parser')
text_content = soup.get_text()
# Add chapter separator if needed
if len(all_content) > 0:
all_content.append('\n\n' + '='*50 + '\n\n')
all_content.append(text_content)
# Create output filename
output_filename = f"{self.file_base}_translated.txt"
output_path = os.path.join(self.output_dir, output_filename)
# Write the translated text
with open(output_path, 'w', encoding='utf-8') as f:
f.write(''.join(all_content))
print(f"✅ Created translated text file: {output_filename}")
return output_path
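if __name__ == "__main__":
    # Minimal smoke test (illustrative sketch; the default input path and
    # output directory below are hypothetical).
    import sys
    input_path = sys.argv[1] if len(sys.argv) > 1 else "sample.txt"
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "output"
    os.makedirs(out_dir, exist_ok=True)
    processor = TextFileProcessor(input_path, out_dir)
    processor.save_original_structure()
    for chapter in processor.extract_chapters():
        print(f"{chapter['num']}: {chapter['title']} ({chapter['file_size']} bytes)")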