# extract_glossary_from_txt.py
import os
import json
from typing import List
from txt_processor import TextFileProcessor
from chapter_splitter import ChapterSplitter
from bs4 import BeautifulSoup
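
# Pipeline overview: TextFileProcessor.extract_chapters() yields chapter dicts
# whose 'body' field holds HTML; chapters larger than the available token
# budget are split with ChapterSplitter, and BeautifulSoup reduces each piece
# to plain text for glossary extraction.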
def extract_chapters_from_txt(txt_path: str) -> List[str]:
    """Extract chapters from text file for glossary extraction"""
    processor = TextFileProcessor(txt_path, os.path.dirname(txt_path))
    chapters = processor.extract_chapters()

    # Initialize chapter splitter
    model_name = os.getenv("MODEL", "gpt-3.5-turbo")
    chapter_splitter = ChapterSplitter(model_name=model_name)

    # Get max tokens from environment
    max_input_tokens_str = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
    if not max_input_tokens_str:
        # Token limit disabled - use a very large number
        max_input_tokens = 10000000  # 10M tokens
    else:
        max_input_tokens = int(max_input_tokens_str)

    # Calculate available tokens (leaving room for system prompt and context)
    system_prompt_size = 2000  # Estimate for glossary system prompt
    context_size = 5000  # Estimate for context history
    safety_margin = 1000
    available_tokens = max_input_tokens - system_prompt_size - context_size - safety_margin
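    # With the default MAX_INPUT_TOKENS of 1,000,000, for example, this leaves
    # 1,000,000 - 2,000 - 5,000 - 1,000 = 992,000 tokens per chunk.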

    text_chapters = []
    for idx, chapter in enumerate(chapters):
        # Check if chapter needs splitting
        chapter_tokens = chapter_splitter.count_tokens(chapter['body'])

        if chapter_tokens > available_tokens:
            print(f"Chapter {idx+1} has {chapter_tokens} tokens, splitting into smaller chunks...")

            # Use ChapterSplitter to split the HTML content
            chunks = chapter_splitter.split_chapter(chapter['body'], available_tokens)

            # Extract text from each chunk
            for chunk_html, chunk_idx, total_chunks in chunks:
                soup = BeautifulSoup(chunk_html, 'html.parser')
                text = soup.get_text(strip=True)
                if text:
                    text_chapters.append(text)
                    print(f"  Added chunk {chunk_idx}/{total_chunks} ({chapter_splitter.count_tokens(text)} tokens)")
        else:
            # Chapter is small enough, extract text as-is
            soup = BeautifulSoup(chapter['body'], 'html.parser')
            text = soup.get_text(strip=True)
            if text:
                text_chapters.append(text)

    print(f"Total text chunks for glossary extraction: {len(text_chapters)}")
    return text_chapters
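

# A minimal usage sketch, assuming the module is run directly on a .txt file;
# MODEL and MAX_INPUT_TOKENS are read from the environment as above, and the
# preview of the first chunk is illustrative only.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python extract_glossary_from_txt.py <path/to/book.txt>")
        sys.exit(1)

    chunks = extract_chapters_from_txt(sys.argv[1])
    if chunks:
        print(f"First chunk preview: {chunks[0][:200]}...")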