import re
import logging
from typing import List, Dict, Tuple
from .llama import NVIDIALLamaClient

logger = logging.getLogger(__name__)


class TextSummarizer:
    def __init__(self):
        try:
            self.llama_client = NVIDIALLamaClient()
        except Exception as e:
            logger.warning(f"Failed to initialize NVIDIA Llama client: {e}")
            self.llama_client = None
    def clean_text(self, text: str) -> str:
        """Clean and normalize text for summarization"""
        if not text:
            return ""
        # Remove common conversation starters and fillers
        conversation_patterns = [
            r'\b(hi|hello|hey|sure|okay|yes|no|thanks|thank you)\b',
            r'\b(here is|this is|let me|i will|i can|i would)\b',
            r'\b(summarize|summary|here\'s|here is)\b',
            r'\b(please|kindly|would you|could you)\b',
            r'\b(um|uh|er|ah|well|so|like|you know)\b'
        ]
        # Remove excessive whitespace and normalize
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\n+', ' ', text)
        # Remove conversation patterns
        for pattern in conversation_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
        # Re-normalize the whitespace left behind by the removed fillers
        text = re.sub(r'\s+', ' ', text)
        # Remove extra punctuation and normalize
        text = re.sub(r'[.]{2,}', '.', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        return text.strip()
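
    # Illustrative behavior, traced by hand (not a doctest). Note that
    # punctuation left behind by removed filler words is not stripped:
    #   clean_text("Sure! Here is the recipe... Preheat the oven to 350F!!")
    #   -> "! the recipe. Preheat the oven to 350F!"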

    def extract_key_phrases(self, text: str) -> List[str]:
        """Extract key cooking phrases and terms"""
        if not text:
            return []
        # Cooking term patterns
        cooking_patterns = [
            r'\b(?:recipe|ingredients?|cooking|baking|roasting|grilling|frying|boiling|steaming)\b',
            r'\b(?:chef|cook|kitchen|cuisine|meal|dish|food|taste|flavor)\b',
            r'\b(?:temperature|timing|preparation|technique|method|seasoning|spices?|herbs?)\b',
            r'\b(?:oven|stovetop|grill|pan|pot|skillet|knife|cutting|chopping)\b',
            r'\b(?:sauce|marinade|dressing|garnish|presentation|serving)\b'
        ]
        key_phrases = []
        for pattern in cooking_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            # Lowercase so that case variants ("Oven"/"oven") dedupe in the set below
            key_phrases.extend(match.lower() for match in matches)
        return list(set(key_phrases))  # Remove duplicates
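
    # Illustrative example, traced by hand (set order is not guaranteed):
    #   extract_key_phrases("Sear the chicken in a hot skillet, then rest it near the oven.")
    #   -> ['skillet', 'oven']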

    def summarize_text(self, text: str, max_length: int = 200) -> str:
        """Summarize text using NVIDIA Llama model"""
        if not self.llama_client:
            return self._summarize_fallback(text, max_length)
        try:
            if not text or len(text.strip()) < 50:
                return text
            # Clean the text first
            cleaned_text = self.clean_text(text)
            # Extract key phrases for context
            key_phrases = self.extract_key_phrases(cleaned_text)
            key_phrases_str = ", ".join(key_phrases[:5]) if key_phrases else "cooking information"
            # Create optimized prompt
            prompt = f"""Summarize this cooking text in {max_length} characters or less. Focus only on key cooking facts, recipes, techniques, and ingredients. Do not include greetings, confirmations, or conversational elements.
Key terms: {key_phrases_str}
Text: {cleaned_text[:1500]}
Summary:"""
            summary = self.llama_client._call_llama(prompt)
            # Post-process summary
            summary = self.clean_text(summary)
            # Ensure it's within length limit
            if len(summary) > max_length:
                summary = summary[:max_length-3] + "..."
            return summary
        except Exception as e:
            logger.error(f"Summarization failed: {e}")
            return self._summarize_fallback(text, max_length)

    def _summarize_fallback(self, text: str, max_length: int = 200) -> str:
        """Fallback summarization when NVIDIA API is not available"""
        if not text:
            return ""
        cleaned_text = self.clean_text(text)
        if len(cleaned_text) <= max_length:
            return cleaned_text
        # Simple truncation with sentence boundary detection
        sentences = cleaned_text.split('. ')
        result = ""
        for sentence in sentences:
            if len(result + sentence) > max_length:
                break
            result += sentence + ". "
        return result.strip() or cleaned_text[:max_length] + "..."
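
    # Illustrative fallback behavior, traced by hand: sentences are appended
    # until the next one would exceed max_length.
    #   _summarize_fallback("Chill the dough for an hour. Roll it thin. Bake at 200C.", 40)
    #   -> "Chill the dough for an hour."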

    def summarize_for_query(self, text: str, query: str, max_length: int = 220) -> str:
        """Summarize text focusing strictly on information relevant to the query.
        Returns an empty string if nothing relevant is found.
        """
        if not self.llama_client:
            return self._summarize_for_query_fallback(text, query, max_length)
        try:
            if not text:
                return ""
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                return ""
            # Short, strict prompt to avoid verbosity; instruct to output NOTHING if irrelevant
            prompt = (
                f"You extract only cooking relevant facts that help answer: '{query}'. "
                f"Respond with a concise bullet list (<= {max_length} chars total). "
                "If the content is irrelevant, respond with EXACTLY: NONE.\n\n"
                f"Content: {cleaned_text[:1600]}\n\nRelevant facts:"
            )
            summary = self.llama_client._call_llama(prompt)
            summary = self.clean_text(summary)
            if not summary or summary.upper().strip() == "NONE":
                return ""
            if len(summary) > max_length:
                summary = summary[:max_length-3] + "..."
            return summary
        except Exception as e:
            logger.warning(f"Query-focused summarization failed: {e}")
            return self._summarize_for_query_fallback(text, query, max_length)

    def _summarize_for_query_fallback(self, text: str, query: str, max_length: int = 220) -> str:
        """Fallback query-focused summarization when NVIDIA API is not available"""
        if not text:
            return ""
        cleaned_text = self.clean_text(text)
        if not cleaned_text:
            return ""
        # Simple keyword matching for relevance
        query_words = set(query.lower().split())
        text_words = set(cleaned_text.lower().split())
        # Check if there's any overlap
        overlap = query_words.intersection(text_words)
        if not overlap:
            return ""
        # Return first few sentences that contain query words
        sentences = cleaned_text.split('. ')
        relevant_sentences = []
        for sentence in sentences:
            sentence_words = set(sentence.lower().split())
            if query_words.intersection(sentence_words):
                relevant_sentences.append(sentence)
                if len('. '.join(relevant_sentences)) > max_length:
                    break
        result = '. '.join(relevant_sentences)
        if len(result) > max_length:
            result = result[:max_length-3] + "..."
        return result
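
    # Illustrative example, traced by hand: only sentences sharing a word with
    # the query survive.
    #   _summarize_for_query_fallback(
    #       "Brine the turkey overnight. Rest the meat before carving.",
    #       "how long to brine turkey")
    #   -> "Brine the turkey overnight"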

    def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
        """Summarize multiple documents with URL mapping"""
        try:
            doc_summaries = []
            url_mapping = {}
            for doc in documents:
                doc_id = doc['id']
                url_mapping[doc_id] = doc['url']
                if not self.llama_client:
                    # No LLM client: fall back to query-focused keyword summarization
                    summary = self._summarize_for_query_fallback(doc['content'], user_query) or doc['content'][:200]
                else:
                    # Create focused summary for each document
                    summary_prompt = f"""Summarize this cooking document in 2-3 sentences, focusing on information relevant to: "{user_query}"
Document: {doc['title']}
Content: {doc['content'][:800]}
Key cooking information:"""
                    summary = self.llama_client._call_llama(summary_prompt)
                    summary = self.clean_text(summary)
                doc_summaries.append(f"Document {doc_id}: {summary}")
            combined_summary = "\n\n".join(doc_summaries)
            return combined_summary, url_mapping
        except Exception as e:
            logger.error(f"Document summarization failed: {e}")
            return "", {}

    def summarize_conversation_chunk(self, chunk: str) -> str:
        """Summarize a conversation chunk for memory"""
        try:
            if not chunk or len(chunk.strip()) < 30:
                return chunk
            cleaned_chunk = self.clean_text(chunk)
            # Guard against a missing LLM client, mirroring the other methods
            if not self.llama_client:
                return cleaned_chunk[:150]
            prompt = f"""Summarize this cooking conversation in 1-2 sentences. Focus only on cooking facts, recipes, techniques, or ingredients discussed. Remove greetings and conversational elements.
Conversation: {cleaned_chunk[:1000]}
Cooking summary:"""
            summary = self.llama_client._call_llama(prompt)
            return self.clean_text(summary)
        except Exception as e:
            logger.error(f"Conversation summarization failed: {e}")
            return self.clean_text(chunk)[:150]

    def chunk_response(self, response: str, max_chunk_size: int = 500) -> List[str]:
        """Split response into chunks and summarize each"""
        try:
            if not response or len(response) <= max_chunk_size:
                return [response]
            # Split by sentences first
            sentences = re.split(r'[.!?]+', response)
            chunks = []
            current_chunk = ""
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue
                # Check if adding this sentence would exceed limit
                if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
                    chunks.append(self.summarize_conversation_chunk(current_chunk))
                    # Start a new chunk, keeping the sentence separator
                    current_chunk = sentence + ". "
                else:
                    current_chunk += sentence + ". "
            # Add the last chunk
            if current_chunk:
                chunks.append(self.summarize_conversation_chunk(current_chunk))
            return chunks
        except Exception as e:
            logger.error(f"Response chunking failed: {e}")
            return [response]


# Global summarizer instance
summarizer = TextSummarizer()
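

# Minimal usage sketch (a hedged example, not part of the original module).
# Because of the relative import of .llama above, run this via the parent
# package, e.g. `python -m <package>.<this_module>`; without NVIDIA
# credentials the summarizer transparently uses the fallback paths.
if __name__ == "__main__":
    sample = (
        "Sure! To roast a chicken, preheat the oven to 425F, "
        "season generously, and roast about 15 minutes per pound."
    )
    print(summarizer.summarize_text(sample, max_length=120))
    print(summarizer.summarize_for_query(sample, "what oven temperature for roast chicken"))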