# txt_processor.py
import os
import re
import json
import hashlib
from typing import List, Tuple, Dict
from decimal import Decimal
from bs4 import BeautifulSoup
from chapter_splitter import ChapterSplitter


class TextFileProcessor:
    """Process plain text files for translation"""

    def __init__(self, file_path: str, output_dir: str):
        self.file_path = file_path
        self.output_dir = output_dir
        self.file_base = os.path.splitext(os.path.basename(file_path))[0]
        # Initialize chapter splitter
        model_name = os.getenv("MODEL", "gpt-3.5-turbo")
        self.chapter_splitter = ChapterSplitter(model_name=model_name)

    def extract_chapters(self) -> List[Dict]:
        """Extract chapters from text file"""
        with open(self.file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # First, detect chapters in the content
        raw_chapters = self._detect_chapters(content)
        # Then, process each chapter for splitting if needed
        final_chapters = self._process_chapters_for_splitting(raw_chapters)
        print(f"📚 Extracted {len(final_chapters)} total chunks from {len(raw_chapters)} detected chapters")
        return final_chapters

    def _detect_chapters(self, content: str) -> List[Dict]:
        """Detect chapter boundaries in the text"""
        chapters = []
        # Chapter detection patterns
        chapter_patterns = [
            # English patterns
            (r'^Chapter\s+(\d+).*$', 'chapter'),
            (r'^CHAPTER\s+(\d+).*$', 'chapter'),
            (r'^Ch\.\s*(\d+).*$', 'chapter'),
            # Numbered sections
            (r'^(\d+)\.\s+(.*)$', 'numbered'),
            (r'^Part\s+(\d+).*$', 'part'),
            # Scene breaks (these don't have numbers)
            (r'^\*\s*\*\s*\*.*$', 'break'),
            (r'^---+.*$', 'break'),
            (r'^===+.*$', 'break'),
        ]
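        # Lines these patterns match include: "Chapter 12", "CHAPTER 3",
        # "Ch. 4", "7. The Return", "Part 2", "* * *", "----", "===="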
        # Find all chapter markers and their positions
        chapter_breaks = []
        lines = content.split('\n')
        for line_num, line in enumerate(lines):
            for pattern, pattern_type in chapter_patterns:
                match = re.match(pattern, line.strip())
                if match:
                    chapter_breaks.append({
                        'line_num': line_num,
                        'line': line,
                        'type': pattern_type,
                        'match': match
                    })
                    break
        if not chapter_breaks:
            # No chapter markers found, treat as single chapter
            print(f"No chapter markers found in {self.file_base}, treating as single document")
            # Use "Section 1" instead of the filename to avoid number-extraction issues
            chapters = [{
                'num': 1,
                'title': 'Section 1',
                'content': content
            }]
        else:
            # Split content by chapter markers
            print(f"Found {len(chapter_breaks)} chapter markers in {self.file_base}")
            for i, chapter_break in enumerate(chapter_breaks):
                # Determine chapter number and title
                chapter_num, chapter_title = self._extract_chapter_info(chapter_break, i)
                # Get content for this chapter
                start_line = chapter_break['line_num'] + 1  # Start after the chapter marker
                # Find where this chapter ends
                if i < len(chapter_breaks) - 1:
                    end_line = chapter_breaks[i + 1]['line_num']
                else:
                    end_line = len(lines)
                # Extract chapter content
                chapter_lines = lines[start_line:end_line]
                chapter_content = '\n'.join(chapter_lines).strip()
                if chapter_content:  # Only add if there's actual content
                    chapters.append({
                        'num': chapter_num,
                        'title': chapter_title,
                        'content': chapter_content
                    })
        return chapters

    def _extract_chapter_info(self, chapter_break: Dict, index: int) -> Tuple[int, str]:
        """Extract chapter number and title from a chapter break"""
        if chapter_break['type'] == 'break':
            # Scene breaks don't have numbers
            chapter_num = index + 1
            chapter_title = f"Section {chapter_num}"
        else:
            # Try to extract the number from the regex match
            match_groups = chapter_break['match'].groups()
            if match_groups and match_groups[0]:  # Group must exist AND be non-empty
                try:
                    # Strip whitespace and check if it's a valid number
                    num_str = match_groups[0].strip()
                    if num_str:  # Only try to convert if not empty
                        chapter_num = int(num_str)
                        chapter_title = chapter_break['line'].strip()
                    else:
                        # Empty match group, use index
                        chapter_num = index + 1
                        chapter_title = chapter_break['line'].strip()
                except (ValueError, IndexError):
                    # Failed to convert to int, use index
                    chapter_num = index + 1
                    chapter_title = chapter_break['line'].strip()
            else:
                # No match groups or empty match
                chapter_num = index + 1
                chapter_title = chapter_break['line'].strip()
        return chapter_num, chapter_title

    def _process_chapters_for_splitting(self, raw_chapters: List[Dict]) -> List[Dict]:
        """Process chapters and split them if they exceed token limits"""
        final_chapters = []
        # Calculate based on OUTPUT token limits
        max_output_tokens = int(os.getenv("MAX_OUTPUT_TOKENS", "8192"))
        compression_factor = float(os.getenv("COMPRESSION_FACTOR", "0.8"))
        safety_margin_output = 500
        # Calculate chunk size based on the output limit
        available_tokens = int((max_output_tokens - safety_margin_output) / compression_factor)
        available_tokens = max(available_tokens, 1000)
        print(f"📊 Text file chunk size: {available_tokens:,} tokens (based on {max_output_tokens:,} output limit, compression: {compression_factor})")
        for chapter_data in raw_chapters:
            # Convert chapter content to HTML format
            chapter_html = self._text_to_html(chapter_data['content'])
            chapter_tokens = self.chapter_splitter.count_tokens(chapter_html)
            if chapter_tokens > available_tokens:
                # Chapter needs splitting
                print(f"Chapter {chapter_data['num']} ({chapter_data['title']}) has {chapter_tokens} tokens, splitting...")
                chunks = self.chapter_splitter.split_chapter(chapter_html, available_tokens)
                # Add each chunk as a separate chapter
                for chunk_html, chunk_idx, total_chunks in chunks:
                    chunk_title = chapter_data['title']
                    if total_chunks > 1:
                        chunk_title = f"{chapter_data['title']} (Part {chunk_idx}/{total_chunks})"
                    # Create float chapter numbers for chunks: 1.0, 1.1, 1.2, etc.
                    chunk_num = round(chapter_data['num'] + (chunk_idx - 1) * 0.1, 1)
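                    # e.g. chapter 3 split into three chunks is numbered
                    # 3.0, 3.1, 3.2; more than ten chunks would collide with
                    # the next chapter's numbering (3 + 10 * 0.1 == 4.0)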
                    final_chapters.append({
                        'num': chunk_num,
                        'title': chunk_title,
                        'body': chunk_html,
                        'filename': f"section_{int(chapter_data['num'])}_part{chunk_idx}.txt",  # Avoid file_base in filenames
                        'content_hash': self._generate_hash(chunk_html),
                        'file_size': len(chunk_html),
                        'has_images': False,
                        'is_chunk': True,
                        'chunk_info': {
                            'chunk_idx': chunk_idx,
                            'total_chunks': total_chunks,
                            'original_chapter': chapter_data['num']
                        }
                    })
            else:
                # Chapter is small enough, add as-is
                final_chapters.append({
                    'num': chapter_data['num'],  # Keep as integer for non-split chapters
                    'title': chapter_data['title'],
                    'body': chapter_html,
                    'filename': f"section_{chapter_data['num']}.txt",
                    'content_hash': self._generate_hash(chapter_html),
                    'file_size': len(chapter_html),
                    'has_images': False,
                    'is_chunk': False
                })
        # Ensure we have at least one chapter
        if not final_chapters:
            # Fallback: create a single chapter with all content
            all_content = '\n\n'.join(ch['content'] for ch in raw_chapters if ch.get('content'))
            if not all_content and raw_chapters:
                all_content = raw_chapters[0].get('content', '')
            final_chapters.append({
                'num': 1,
                'title': 'Section 1',
                'body': self._text_to_html(all_content or 'Empty file'),
                'filename': 'section_1.txt',
                'content_hash': self._generate_hash(all_content or ''),
                'file_size': len(all_content or ''),
                'has_images': False,
                'is_chunk': False
            })
        return final_chapters

    def _text_to_html(self, text: str) -> str:
        """Convert plain text to HTML format"""
        # Escape HTML special characters
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        # Split into paragraphs
        paragraphs = text.split('\n\n')
        # Wrap each paragraph in <p> tags
        html_parts = []
        for para in paragraphs:
            para = para.strip()
            if para:
                # Check if it's a chapter heading
                if re.match(r'^(Chapter|CHAPTER|Ch\.|Part)\s+\d+', para):
                    html_parts.append(f'<h1>{para}</h1>')
                else:
                    # Replace single newlines with <br> within paragraphs
                    para = para.replace('\n', '<br>\n')
                    html_parts.append(f'<p>{para}</p>')
        # Create a simple HTML structure
        html = f"""<html>
<head>
<title>{self.file_base}</title>
<meta charset="utf-8"/>
</head>
<body>
{''.join(html_parts)}
</body>
</html>"""
        return html

    def _generate_hash(self, content: str) -> str:
        """Generate hash for content"""
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    def save_original_structure(self):
        """Save original text file structure info"""
        metadata = {
            'source_file': os.path.basename(self.file_path),
            'type': 'text',
            'encoding': 'utf-8'
        }
        metadata_path = os.path.join(self.output_dir, 'metadata.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def create_output_structure(self, translated_chapters: List[Tuple[str, str]]) -> str:
        """Create output text file from translated chapters"""
        # Sort chapters by filename to ensure correct order; compare the
        # numeric parts as integers so section_10 sorts after section_2
        def natural_key(item: Tuple[str, str]):
            return [int(tok) if tok.isdigit() else tok
                    for tok in re.split(r'(\d+)', item[0])]
        sorted_chapters = sorted(translated_chapters, key=natural_key)
        # Combine all content
        all_content = []
        for filename, content in sorted_chapters:
            # Extract text from HTML
            soup = BeautifulSoup(content, 'html.parser')
            text_content = soup.get_text()
            # Add chapter separator if needed
            if len(all_content) > 0:
                all_content.append('\n\n' + '=' * 50 + '\n\n')
            all_content.append(text_content)
        # Create output filename
        output_filename = f"{self.file_base}_translated.txt"
        output_path = os.path.join(self.output_dir, output_filename)
        # Write the translated text
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(''.join(all_content))
        print(f"✅ Created translated text file: {output_filename}")
        return output_path
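

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the module's API).
# It assumes a source file "book.txt" exists; the "output" directory name is
# arbitrary, and MODEL / MAX_OUTPUT_TOKENS fall back to the defaults above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    os.makedirs("output", exist_ok=True)  # the class does not create its output dir
    processor = TextFileProcessor("book.txt", "output")
    processor.save_original_structure()
    chapters = processor.extract_chapters()
    for ch in chapters:
        print(f"{ch['num']}: {ch['title']} ({ch['file_size']} bytes)")
    # Once each chapter body has been translated, (filename, html) pairs are
    # fed back to produce the final text file, e.g.:
    # processor.create_output_structure([(ch['filename'], ch['body']) for ch in chapters])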