import re
import logging
from typing import List, Dict, Tuple

from .llama import NVIDIALLamaClient

logger = logging.getLogger(__name__)


class TextSummarizer:
    def __init__(self):
        self.llama_client = NVIDIALLamaClient()
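
    # Note: NVIDIALLamaClient is assumed, from its use throughout this module,
    # to expose a _call_llama(prompt: str) -> str method that returns the raw
    # model completion for a single prompt.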

    def clean_text(self, text: str) -> str:
        """Clean and normalize text for summarization."""
        if not text:
            return ""

        # Remove common conversation starters and fillers
        conversation_patterns = [
            r'\b(hi|hello|hey|sure|okay|yes|no|thanks|thank you)\b',
            r'\b(here is|this is|let me|i will|i can|i would)\b',
            r'\b(summarize|summary|here\'s|here is)\b',
            r'\b(please|kindly|would you|could you)\b',
            r'\b(um|uh|er|ah|well|so|like|you know)\b'
        ]

        # Collapse all whitespace (including newlines) to single spaces
        text = re.sub(r'\s+', ' ', text)

        # Remove conversation patterns
        for pattern in conversation_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)

        # Collapse the double spaces left behind by pattern removal
        text = re.sub(r'\s+', ' ', text)

        # Remove extra punctuation and normalize
        text = re.sub(r'[.]{2,}', '.', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)

        return text.strip()
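
    # Illustrative behaviour (traced by hand, not a guaranteed output):
    #   clean_text("Sure! Here is a quick tip: rest the steak for 5 minutes...")
    #   -> "! a quick tip: rest the steak for 5 minutes."
    # Filler words are deleted outright, so stray punctuation can remain.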

    def extract_key_phrases(self, text: str) -> List[str]:
        """Extract key cooking phrases and terms."""
        if not text:
            return []

        # Cooking term patterns
        cooking_patterns = [
            r'\b(?:recipe|ingredients?|cooking|baking|roasting|grilling|frying|boiling|steaming)\b',
            r'\b(?:chef|cook|kitchen|cuisine|meal|dish|food|taste|flavor)\b',
            r'\b(?:temperature|timing|preparation|technique|method|seasoning|spices?|herbs?)\b',
            r'\b(?:oven|stovetop|grill|pan|pot|skillet|knife|cutting|chopping)\b',
            r'\b(?:sauce|marinade|dressing|garnish|presentation|serving)\b'
        ]

        key_phrases = []
        for pattern in cooking_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            key_phrases.extend(matches)

        return list(set(key_phrases))  # Remove duplicates
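
    # Illustrative behaviour: on "Preheat the oven and add the herbs", the
    # patterns match "oven" and "herbs", so this returns ["oven", "herbs"]
    # (in arbitrary order -- set() does not preserve match order).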

    def summarize_text(self, text: str, max_length: int = 200) -> str:
        """Summarize text using the NVIDIA Llama model."""
        try:
            if not text or len(text.strip()) < 50:
                return text

            # Clean the text first
            cleaned_text = self.clean_text(text)

            # Extract key phrases for context
            key_phrases = self.extract_key_phrases(cleaned_text)
            key_phrases_str = ", ".join(key_phrases[:5]) if key_phrases else "cooking information"

            # Create optimized prompt
            prompt = f"""Summarize this cooking text in {max_length} characters or less. Focus only on key cooking facts, recipes, techniques, and ingredients. Do not include greetings, confirmations, or conversational elements.

Key terms: {key_phrases_str}

Text: {cleaned_text[:1500]}

Summary:"""

            summary = self.llama_client._call_llama(prompt)

            # Post-process summary
            summary = self.clean_text(summary)

            # Ensure it's within the length limit
            if len(summary) > max_length:
                summary = summary[:max_length - 3] + "..."

            return summary
        except Exception as e:
            logger.error(f"Summarization failed: {e}")
            # Fall back to simple truncation
            return self.clean_text(text)[:max_length]
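
    # Degradation path: any client failure above is caught, so callers always
    # get a string back -- in the worst case clean_text(text)[:max_length],
    # a hard truncation rather than a model summary.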

    def summarize_for_query(self, text: str, query: str, max_length: int = 220) -> str:
        """Summarize text focusing strictly on information relevant to the query.

        Returns an empty string if nothing relevant is found.
        """
        try:
            if not text:
                return ""

            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                return ""

            # Short, strict prompt to avoid verbosity; instruct the model to
            # output a sentinel (NONE) if the content is irrelevant
            prompt = (
                f"You extract only cooking-relevant facts that help answer: '{query}'. "
                f"Respond with a concise bullet list (<= {max_length} chars total). "
                "If the content is irrelevant, respond with EXACTLY: NONE.\n\n"
                f"Content: {cleaned_text[:1600]}\n\nRelevant facts:"
            )

            summary = self.llama_client._call_llama(prompt)
            summary = self.clean_text(summary)

            if not summary or summary.upper().strip() == "NONE":
                return ""
            if len(summary) > max_length:
                summary = summary[:max_length - 3] + "..."
            return summary
        except Exception as e:
            logger.warning(f"Query-focused summarization failed: {e}")
            return ""

    def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
        """Summarize multiple documents with URL mapping."""
        try:
            doc_summaries = []
            url_mapping = {}

            for doc in documents:
                doc_id = doc['id']
                url_mapping[doc_id] = doc['url']

                # Create a focused summary for each document
                summary_prompt = f"""Summarize this cooking document in 2-3 sentences, focusing on information relevant to: "{user_query}"

Document: {doc['title']}

Content: {doc['content'][:800]}

Key cooking information:"""

                summary = self.llama_client._call_llama(summary_prompt)
                summary = self.clean_text(summary)
                doc_summaries.append(f"Document {doc_id}: {summary}")

            combined_summary = "\n\n".join(doc_summaries)
            return combined_summary, url_mapping
        except Exception as e:
            logger.error(f"Document summarization failed: {e}")
            return "", {}

    def summarize_conversation_chunk(self, chunk: str) -> str:
        """Summarize a conversation chunk for memory."""
        try:
            if not chunk or len(chunk.strip()) < 30:
                return chunk

            cleaned_chunk = self.clean_text(chunk)

            prompt = f"""Summarize this cooking conversation in 1-2 sentences. Focus only on cooking facts, recipes, techniques, or ingredients discussed. Remove greetings and conversational elements.

Conversation: {cleaned_chunk[:1000]}

Cooking summary:"""

            summary = self.llama_client._call_llama(prompt)
            return self.clean_text(summary)
        except Exception as e:
            logger.error(f"Conversation summarization failed: {e}")
            return self.clean_text(chunk)[:150]

    def chunk_response(self, response: str, max_chunk_size: int = 500) -> List[str]:
        """Split a response into chunks and summarize each."""
        try:
            if not response or len(response) <= max_chunk_size:
                return [response]

            # Split by sentences first (the delimiters themselves are discarded)
            sentences = re.split(r'[.!?]+', response)

            chunks = []
            current_chunk = ""
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue
                # Check if adding this sentence would exceed the limit
                if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
                    chunks.append(self.summarize_conversation_chunk(current_chunk))
                    # Start the next chunk with a trailing separator so later
                    # sentences don't run together
                    current_chunk = sentence + ". "
                else:
                    current_chunk += sentence + ". "

            # Add the last chunk
            if current_chunk:
                chunks.append(self.summarize_conversation_chunk(current_chunk))

            return chunks
        except Exception as e:
            logger.error(f"Response chunking failed: {e}")
            return [response]
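
    # Note that the returned "chunks" are themselves summaries: each slice of
    # the response is passed through summarize_conversation_chunk, so callers
    # receive condensed text, not verbatim substrings.
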
# Global summarizer instance
summarizer = TextSummarizer()
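
# Usage sketch (illustrative, not part of the original module). The relative
# import above means this file must be loaded as part of its package, and the
# package/module names below are assumptions:
#
#   from app.summarizer import summarizer
#   short = summarizer.summarize_text(transcript, max_length=200)
#   focused = summarizer.summarize_for_query(page_text, "How long should steak rest?")
#   if not focused:
#       ...  # nothing relevant was found in page_text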