"""Cooking-focused text summarization utilities built on the NVIDIA Llama client."""

import re
import logging
from typing import List, Dict, Tuple
from .llama import NVIDIALLamaClient

logger = logging.getLogger(__name__)

class TextSummarizer:
    def __init__(self):
        self.llama_client = NVIDIALLamaClient()
    
    def clean_text(self, text: str) -> str:
        """Clean and normalize text for summarization"""
        if not text:
            return ""
        
        # Common conversation starters and fillers to strip (whole words only;
        # punctuation attached to a removed word, e.g. "Hello!", is left behind)
        conversation_patterns = [
            r'\b(hi|hello|hey|sure|okay|yes|no|thanks|thank you)\b',
            r'\b(here is|this is|let me|i will|i can|i would)\b',
            r'\b(summarize|summary|here\'s)\b',
            r'\b(please|kindly|would you|could you)\b',
            r'\b(um|uh|er|ah|well|so|like|you know)\b'
        ]
        
        # Collapse all whitespace (including newlines) to single spaces
        text = re.sub(r'\s+', ' ', text)
        
        # Remove conversation patterns
        for pattern in conversation_patterns:
            text = re.sub(pattern, '', text, flags=re.IGNORECASE)
        
        # Collapse repeated punctuation
        text = re.sub(r'[.]{2,}', '.', text)
        text = re.sub(r'[!]{2,}', '!', text)
        text = re.sub(r'[?]{2,}', '?', text)
        
        # Re-collapse whitespace left behind by the removals above
        text = re.sub(r'\s+', ' ', text)
        
        return text.strip()
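    # Illustrative trace (hypothetical input): clean_text("Sure!!  Here is the
    # recipe...") strips "Sure" and "Here is" but keeps their trailing
    # punctuation, yielding "! the recipe." -- see the caveat noted above.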
    
    def extract_key_phrases(self, text: str) -> List[str]:
        """Extract key cooking phrases and terms"""
        if not text:
            return []
        
        # Cooking term patterns
        cooking_patterns = [
            r'\b(?:recipe|ingredients?|cooking|baking|roasting|grilling|frying|boiling|steaming)\b',
            r'\b(?:chef|cook|kitchen|cuisine|meal|dish|food|taste|flavor)\b',
            r'\b(?:temperature|timing|preparation|technique|method|seasoning|spices?|herbs?)\b',
            r'\b(?:oven|stovetop|grill|pan|pot|skillet|knife|cutting|chopping)\b',
            r'\b(?:sauce|marinade|dressing|garnish|presentation|serving)\b'
        ]
        
        key_phrases = []
        for pattern in cooking_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            key_phrases.extend(matches)
        
        return list(set(key_phrases))  # Remove duplicates
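    # Illustrative example (hypothetical input): for "Preheat the oven and plate
    # the dish", the patterns above match "oven" and "dish", so the method
    # returns ["oven", "dish"] in arbitrary order (set() discards ordering).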
    
    def summarize_text(self, text: str, max_length: int = 200) -> str:
        """Summarize text using NVIDIA Llama model"""
        try:
            if not text or len(text.strip()) < 50:
                return text
            
            # Clean the text first
            cleaned_text = self.clean_text(text)
            
            # Extract key phrases for context
            key_phrases = self.extract_key_phrases(cleaned_text)
            key_phrases_str = ", ".join(key_phrases[:5]) if key_phrases else "cooking information"
            
            # Create optimized prompt
            prompt = f"""Summarize this cooking text in {max_length} characters or less. Focus only on key cooking facts, recipes, techniques, and ingredients. Do not include greetings, confirmations, or conversational elements.

Key terms: {key_phrases_str}

Text: {cleaned_text[:1500]}

Summary:"""

            summary = self.llama_client._call_llama(prompt)
            
            # Post-process summary
            summary = self.clean_text(summary)
            
            # Ensure it's within length limit
            if len(summary) > max_length:
                summary = summary[:max_length-3] + "..."
            
            return summary
            
        except Exception as e:
            logger.error(f"Summarization failed: {e}")
            # Fallback to simple truncation
            return self.clean_text(text)[:max_length]

    def summarize_for_query(self, text: str, query: str, max_length: int = 220) -> str:
        """Summarize text focusing strictly on information relevant to the query.
        Returns an empty string if nothing relevant is found.
        """
        try:
            if not text:
                return ""
            cleaned_text = self.clean_text(text)
            if not cleaned_text:
                return ""

            # Short, strict prompt to avoid verbosity; instruct to output NOTHING if irrelevant
            prompt = (
                f"You extract only cooking-relevant facts that help answer: '{query}'. "
                f"Respond with a concise bullet list (<= {max_length} chars total). "
                "If the content is irrelevant, respond with EXACTLY: NONE.\n\n"
                f"Content: {cleaned_text[:1600]}\n\nRelevant facts:"
            )

            summary = self.llama_client._call_llama(prompt)
            summary = self.clean_text(summary)
            if not summary or summary.upper().strip() == "NONE":
                return ""
            if len(summary) > max_length:
                summary = summary[:max_length-3] + "..."
            return summary
        except Exception as e:
            logger.warning(f"Query-focused summarization failed: {e}")
            return ""
    
    def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
        """Summarize multiple documents with URL mapping"""
        try:
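            # Each document is assumed to carry the keys accessed below
            # (inferred from this method's usage, not a documented schema):
            #   {"id": 1, "url": "https://...", "title": "...", "content": "..."}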
            doc_summaries = []
            url_mapping = {}
            
            for doc in documents:
                doc_id = doc['id']
                url_mapping[doc_id] = doc['url']
                
                # Create focused summary for each document
                summary_prompt = f"""Summarize this cooking document in 2-3 sentences, focusing on information relevant to: "{user_query}"

Document: {doc['title']}
Content: {doc['content'][:800]}

Key cooking information:"""

                summary = self.llama_client._call_llama(summary_prompt)
                summary = self.clean_text(summary)
                
                doc_summaries.append(f"Document {doc_id}: {summary}")
            
            combined_summary = "\n\n".join(doc_summaries)
            return combined_summary, url_mapping
            
        except Exception as e:
            logger.error(f"Document summarization failed: {e}")
            return "", {}
    
    def summarize_conversation_chunk(self, chunk: str) -> str:
        """Summarize a conversation chunk for memory"""
        try:
            if not chunk or len(chunk.strip()) < 30:
                return chunk
            
            cleaned_chunk = self.clean_text(chunk)
            
            prompt = f"""Summarize this cooking conversation in 1-2 sentences. Focus only on cooking facts, recipes, techniques, or ingredients discussed. Remove greetings and conversational elements.

Conversation: {cleaned_chunk[:1000]}

Cooking summary:"""

            summary = self.llama_client._call_llama(prompt)
            return self.clean_text(summary)
            
        except Exception as e:
            logger.error(f"Conversation summarization failed: {e}")
            return self.clean_text(chunk)[:150]
    
    def chunk_response(self, response: str, max_chunk_size: int = 500) -> List[str]:
        """Split response into chunks and summarize each"""
        try:
            if not response or len(response) <= max_chunk_size:
                return [response]
            
            # Split by sentences first
            sentences = re.split(r'[.!?]+', response)
            chunks = []
            current_chunk = ""
            
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue
                
                # Flush the current chunk when adding this sentence would exceed the limit
                if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
                    chunks.append(self.summarize_conversation_chunk(current_chunk))
                    # Start the new chunk with its separator, so the next
                    # sentence is not fused onto this one
                    current_chunk = sentence + ". "
                else:
                    current_chunk += sentence + ". "
            
            # Add the last chunk
            if current_chunk:
                chunks.append(self.summarize_conversation_chunk(current_chunk))
            
            return chunks
            
        except Exception as e:
            logger.error(f"Response chunking failed: {e}")
            return [response]

# Global summarizer instance
summarizer = TextSummarizer()
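
# Minimal usage sketch: only the local, regex-based helpers are exercised here,
# since the summarize_* methods call out to the NVIDIA Llama endpoint. Assumes
# NVIDIALLamaClient() can be constructed in this environment (it is already
# instantiated by the module-level `summarizer` above).
if __name__ == "__main__":
    print(summarizer.clean_text("Sure!!  Here is the recipe: roast at 400F"))
    print(summarizer.extract_key_phrases("Whisk the marinade, then grill the dish"))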