# extract_glossary_from_txt.py
import os
import json
from typing import List
from txt_processor import TextFileProcessor
from chapter_splitter import ChapterSplitter
from bs4 import BeautifulSoup
def extract_chapters_from_txt(txt_path: str) -> List[str]:
    """Extract chapters from a text file for glossary extraction."""
    processor = TextFileProcessor(txt_path, os.path.dirname(txt_path))
    chapters = processor.extract_chapters()

    # Initialize chapter splitter
    model_name = os.getenv("MODEL", "gpt-3.5-turbo")
    chapter_splitter = ChapterSplitter(model_name=model_name)

    # Get max tokens from environment
    max_input_tokens_str = os.getenv("MAX_INPUT_TOKENS", "1000000").strip()
    if not max_input_tokens_str:
        # Token limit disabled - use a very large number
        max_input_tokens = 10_000_000  # 10M tokens
    else:
        max_input_tokens = int(max_input_tokens_str)

    # Calculate available tokens (leaving room for system prompt and context)
    system_prompt_size = 2000  # Estimate for glossary system prompt
    context_size = 5000        # Estimate for context history
    safety_margin = 1000
    available_tokens = max_input_tokens - system_prompt_size - context_size - safety_margin

    text_chapters = []
    for idx, chapter in enumerate(chapters):
        # Check if chapter needs splitting
        chapter_tokens = chapter_splitter.count_tokens(chapter['body'])
        if chapter_tokens > available_tokens:
            print(f"Chapter {idx+1} has {chapter_tokens} tokens, splitting into smaller chunks...")
            # Use ChapterSplitter to split the HTML content
            chunks = chapter_splitter.split_chapter(chapter['body'], available_tokens)
            # Extract plain text from each chunk
            for chunk_html, chunk_idx, total_chunks in chunks:
                soup = BeautifulSoup(chunk_html, 'html.parser')
                text = soup.get_text(strip=True)
                if text:
                    text_chapters.append(text)
                    print(f"  Added chunk {chunk_idx}/{total_chunks} ({chapter_splitter.count_tokens(text)} tokens)")
        else:
            # Chapter is small enough, extract text as-is
            soup = BeautifulSoup(chapter['body'], 'html.parser')
            text = soup.get_text(strip=True)
            if text:
                text_chapters.append(text)

    print(f"Total text chunks for glossary extraction: {len(text_chapters)}")
    return text_chapters
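

# A minimal usage sketch (not part of the original module): assuming the script is
# run directly with a path to a .txt file, it prints how many plain-text chunks
# would be handed to glossary extraction. The command-line interface shown here is
# hypothetical, not something the module itself defines.
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print("Usage: python extract_glossary_from_txt.py <path/to/book.txt>")
        sys.exit(1)

    chunks = extract_chapters_from_txt(sys.argv[1])
    # Each entry in `chunks` is plain text that fits within the computed token budget.
    print(f"Prepared {len(chunks)} chunk(s) for glossary extraction")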