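"""CatalystGPT-4 Advanced Document Summarizer (Gradio app).

Single-file Gradio application that extracts text from PDF, DOCX, TXT, MD and
RTF files, produces extractive or transformer-based summaries, and reports
document analytics (key points, outline, readability, sentiment, word
frequency). Heavy dependencies are optional and feature-gated at import time;
see the try/except blocks below and the *_AVAILABLE flags. This docstring is
an added description of the code that follows, not part of the original file.
"""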
import gradio as gr
import os
import re
import json
import tempfile
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Tuple, Optional, Union
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optional imports for document processing
try:
    from docx import Document
    DOCX_AVAILABLE = True
except ImportError:
    DOCX_AVAILABLE = False
    logger.warning("python-docx not installed. DOCX processing will be disabled.")

try:
    import PyPDF2
    PDF_AVAILABLE = True
except ImportError:
    PDF_AVAILABLE = False
    logger.warning("PyPDF2 not installed. PDF processing will be disabled.")

try:
    import fitz  # PyMuPDF - alternative PDF processor
    PYMUPDF_AVAILABLE = True
except ImportError:
    PYMUPDF_AVAILABLE = False
# Optional imports for advanced text processing
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    from nltk.corpus import stopwords
    from nltk.probability import FreqDist
    from nltk.sentiment import SentimentIntensityAnalyzer
    NLTK_AVAILABLE = True

    # Download required NLTK data if it is not already present
    required_nltk_data = ['punkt', 'stopwords', 'vader_lexicon']
    for data_name in required_nltk_data:
        try:
            if data_name == 'punkt':
                nltk.data.find('tokenizers/punkt')
            elif data_name == 'stopwords':
                nltk.data.find('corpora/stopwords')
            elif data_name == 'vader_lexicon':
                nltk.data.find('sentiment/vader_lexicon.zip')
        except LookupError:
            nltk.download(data_name, quiet=True)
except ImportError:
    NLTK_AVAILABLE = False
    logger.warning("NLTK not installed. Advanced text analysis will be limited.")
try:
    from transformers import pipeline
    import torch
    TRANSFORMERS_AVAILABLE = True
    DEVICE = 0 if torch.cuda.is_available() else -1
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    DEVICE = -1
    logger.warning("transformers not installed. AI summarization will use basic extraction methods.")
class AdvancedDocumentSummarizer:
    """CatalystGPT-4 Advanced Document Summarizer with enhanced features"""

    def __init__(self):
        self.summarizer = None
        self.sentiment_analyzer = None
        self.cache = {}

        # Initialize AI models
        if TRANSFORMERS_AVAILABLE:
            self._initialize_ai_models()

        # Initialize sentiment analyzer
        if NLTK_AVAILABLE:
            try:
                self.sentiment_analyzer = SentimentIntensityAnalyzer()
            except Exception as e:
                logger.warning(f"Failed to initialize sentiment analyzer: {e}")

    def _initialize_ai_models(self):
        """Initialize AI models with error handling and fallbacks"""
        models_to_try = [
            "facebook/bart-large-cnn",
            "t5-small",
            "google/pegasus-xsum"
        ]
        for model_name in models_to_try:
            try:
                self.summarizer = pipeline(
                    "summarization",
                    model=model_name,
                    device=DEVICE,
                    torch_dtype=torch.float16 if DEVICE >= 0 else torch.float32
                )
                logger.info(f"Successfully loaded {model_name}")
                break
            except Exception as e:
                logger.warning(f"Failed to load {model_name}: {e}")
                continue

    def _get_file_hash(self, file_path: str) -> str:
        """Generate hash for file caching"""
        try:
            with open(file_path, 'rb') as f:
                content = f.read()
            return hashlib.md5(content).hexdigest()
        except Exception:
            return str(datetime.now().timestamp())

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Enhanced PDF text extraction with better error handling"""
        text = ""

        # Try PyMuPDF first (generally better)
        if PYMUPDF_AVAILABLE:
            try:
                doc = fitz.open(file_path)
                for page_num, page in enumerate(doc):
                    page_text = page.get_text()
                    if page_text.strip():  # Only add non-empty pages
                        text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                doc.close()
                if text.strip():
                    return text
            except Exception as e:
                logger.error(f"PyMuPDF extraction failed: {e}")

        # Fallback to PyPDF2
        if PDF_AVAILABLE:
            try:
                with open(file_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    for page_num, page in enumerate(pdf_reader.pages):
                        page_text = page.extract_text()
                        if page_text.strip():
                            text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
                if text.strip():
                    return text
            except Exception as e:
                logger.error(f"PyPDF2 extraction failed: {e}")

        return "PDF processing libraries not available or extraction failed."

    def extract_text_from_docx(self, file_path: str) -> str:
        """Enhanced DOCX extraction with better formatting preservation"""
        if not DOCX_AVAILABLE:
            return "python-docx library not available."
        try:
            doc = Document(file_path)
            text_parts = []

            # Extract paragraphs
            for paragraph in doc.paragraphs:
                if paragraph.text.strip():
                    text_parts.append(paragraph.text)

            # Extract tables
            for table_num, table in enumerate(doc.tables):
                text_parts.append(f"\n--- Table {table_num + 1} ---")
                for row in table.rows:
                    row_text = " | ".join(cell.text.strip() for cell in row.cells)
                    if row_text.strip():
                        text_parts.append(row_text)

            return "\n".join(text_parts)
        except Exception as e:
            logger.error(f"Error processing DOCX file: {e}")
            return f"Error processing DOCX file: {str(e)}"
    def get_enhanced_document_stats(self, text: str) -> Dict:
        """Get comprehensive document statistics with sentiment analysis"""
        if not text.strip():
            return {}

        # Basic stats
        word_count = len(text.split())
        char_count = len(text)
        char_count_no_spaces = len(text.replace(' ', ''))
        paragraph_count = len([p for p in text.split('\n\n') if p.strip()])

        stats = {
            'word_count': word_count,
            'character_count': char_count,
            'character_count_no_spaces': char_count_no_spaces,
            'paragraph_count': paragraph_count,
            'estimated_reading_time': max(1, round(word_count / 200)),  # 200 WPM average
            'estimated_speaking_time': max(1, round(word_count / 150))  # 150 WPM speaking
        }

        if NLTK_AVAILABLE:
            sentences = sent_tokenize(text)
            stats['sentence_count'] = len(sentences)
            stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0

            # Word frequency analysis
            words = word_tokenize(text.lower())
            stop_words = set(stopwords.words('english'))
            filtered_words = [w for w in words if w.isalpha() and w not in stop_words and len(w) > 2]
            if filtered_words:
                freq_dist = FreqDist(filtered_words)
                stats['top_words'] = freq_dist.most_common(15)
                stats['unique_words'] = len(set(filtered_words))
                stats['lexical_diversity'] = round(len(set(filtered_words)) / len(filtered_words), 3) if filtered_words else 0

            # Sentiment analysis
            if self.sentiment_analyzer:
                try:
                    sentiment_scores = self.sentiment_analyzer.polarity_scores(text[:5000])  # Limit for performance
                    stats['sentiment'] = {
                        'compound': round(sentiment_scores['compound'], 3),
                        'positive': round(sentiment_scores['pos'], 3),
                        'negative': round(sentiment_scores['neg'], 3),
                        'neutral': round(sentiment_scores['neu'], 3)
                    }
                except Exception as e:
                    logger.error(f"Sentiment analysis failed: {e}")
        else:
            # Fallback without NLTK
            sentences = [s.strip() for s in text.split('.') if s.strip()]
            stats['sentence_count'] = len(sentences)
            stats['avg_sentence_length'] = round(word_count / len(sentences), 1) if sentences else 0

            words = re.findall(r'\b\w+\b', text.lower())
            word_freq = {}
            for word in words:
                if len(word) > 2:
                    word_freq[word] = word_freq.get(word, 0) + 1
            stats['top_words'] = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:15]
            stats['unique_words'] = len(set(words))

        return stats
    def advanced_extractive_summary(self, text: str, num_sentences: int = 3) -> str:
        """Enhanced extractive summarization with improved sentence scoring"""
        if not text.strip():
            return "No text to summarize."

        if NLTK_AVAILABLE:
            sentences = sent_tokenize(text)
        else:
            sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

        if len(sentences) <= num_sentences:
            return text

        # Enhanced sentence scoring
        scored_sentences = []
        total_sentences = len(sentences)

        # Calculate word frequencies for TF scoring
        all_words = re.findall(r'\b\w+\b', text.lower())
        word_freq = {}
        for word in all_words:
            if len(word) > 2:
                word_freq[word] = word_freq.get(word, 0) + 1

        # Important keywords that boost sentence scores
        importance_keywords = [
            'conclusion', 'summary', 'result', 'finding', 'important', 'significant',
            'key', 'main', 'primary', 'essential', 'crucial', 'objective', 'goal',
            'recommendation', 'suggest', 'propose', 'indicate', 'show', 'demonstrate'
        ]

        for i, sentence in enumerate(sentences):
            if len(sentence.split()) < 5:  # Skip very short sentences
                continue

            score = 0
            sentence_lower = sentence.lower()
            sentence_words = sentence.split()

            # Position scoring (beginning and end are more important)
            if i < total_sentences * 0.15:  # First 15%
                score += 3
            elif i > total_sentences * 0.85:  # Last 15%
                score += 2
            elif total_sentences * 0.4 <= i <= total_sentences * 0.6:  # Middle section
                score += 1

            # Length scoring (prefer moderate length)
            word_count = len(sentence_words)
            if 12 <= word_count <= 25:
                score += 3
            elif 8 <= word_count <= 35:
                score += 2
            elif 5 <= word_count <= 45:
                score += 1

            # Keyword importance scoring
            keyword_score = sum(2 if keyword in sentence_lower else 0 for keyword in importance_keywords)
            score += min(keyword_score, 6)  # Cap keyword bonus

            # TF-based scoring (frequency of important words)
            tf_score = 0
            for word in sentence_words:
                word_lower = word.lower()
                if word_lower in word_freq and len(word_lower) > 3:
                    tf_score += min(word_freq[word_lower], 5)  # Cap individual word contribution
            score += min(tf_score / len(sentence_words), 3)  # Normalize by sentence length

            # Structural indicators
            if any(indicator in sentence for indicator in [':', '–', '"', '(']):
                score += 1

            # Numerical data (often important)
            if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence):
                score += 1

            scored_sentences.append((sentence, score, i))

        # Sort by score and select top sentences
        scored_sentences.sort(key=lambda x: x[1], reverse=True)
        selected_sentences = scored_sentences[:num_sentences]

        # Sort selected sentences by original position to maintain flow
        selected_sentences.sort(key=lambda x: x[2])

        return ' '.join([s[0] for s in selected_sentences])
    def intelligent_chunking(self, text: str, max_chunk_size: int = 1024) -> List[str]:
        """Intelligently chunk text while preserving semantic boundaries"""
        if len(text) <= max_chunk_size:
            return [text]

        chunks = []
        # Try to split by double newlines first (paragraphs)
        paragraphs = text.split('\n\n')
        current_chunk = ""

        for paragraph in paragraphs:
            # If single paragraph is too long, split by sentences
            if len(paragraph) > max_chunk_size:
                if current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = ""

                # Split long paragraph by sentences
                if NLTK_AVAILABLE:
                    sentences = sent_tokenize(paragraph)
                else:
                    sentences = [s.strip() for s in paragraph.split('.') if s.strip()]

                temp_chunk = ""
                for sentence in sentences:
                    if len(temp_chunk + sentence) <= max_chunk_size:
                        temp_chunk += sentence + ". "
                    else:
                        if temp_chunk:
                            chunks.append(temp_chunk.strip())
                        temp_chunk = sentence + ". "
                if temp_chunk:
                    current_chunk = temp_chunk
            else:
                # Normal paragraph processing
                if len(current_chunk + paragraph) <= max_chunk_size:
                    current_chunk += paragraph + "\n\n"
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = paragraph + "\n\n"

        if current_chunk:
            chunks.append(current_chunk.strip())

        return [chunk for chunk in chunks if chunk.strip()]

    def ai_summary(self, text: str, max_length: int = 150, min_length: int = 50) -> str:
        """Enhanced AI-powered summarization with better chunking and error handling"""
        if not self.summarizer:
            return self.advanced_extractive_summary(text)

        try:
            # Intelligent chunking
            chunks = self.intelligent_chunking(text, 1000)  # Slightly smaller chunks for better quality
            if not chunks:
                return "No meaningful content found for summarization."

            summaries = []
            for i, chunk in enumerate(chunks):
                if len(chunk.strip()) < 50:  # Skip very short chunks
                    continue
                try:
                    # Adjust parameters based on chunk size
                    chunk_max_length = min(max_length, max(50, len(chunk.split()) // 3))
                    chunk_min_length = min(min_length, chunk_max_length // 2)

                    summary = self.summarizer(
                        chunk,
                        max_length=chunk_max_length,
                        min_length=chunk_min_length,
                        do_sample=False,
                        truncation=True
                    )
                    summaries.append(summary[0]['summary_text'])
                except Exception as e:
                    logger.warning(f"Error summarizing chunk {i}: {e}")
                    # Fallback to extractive summary for this chunk
                    fallback_summary = self.advanced_extractive_summary(chunk, 2)
                    if fallback_summary and fallback_summary != "No text to summarize.":
                        summaries.append(fallback_summary)

            if not summaries:
                return self.advanced_extractive_summary(text)

            # Combine and refine summaries
            if len(summaries) == 1:
                return summaries[0]
            else:
                combined_summary = ' '.join(summaries)
                # If combined summary is still too long, summarize again
                if len(combined_summary.split()) > max_length * 1.5:
                    try:
                        final_summary = self.summarizer(
                            combined_summary,
                            max_length=max_length,
                            min_length=min_length,
                            do_sample=False,
                            truncation=True
                        )
                        return final_summary[0]['summary_text']
                    except Exception:
                        return combined_summary[:max_length * 10]  # Rough character limit fallback
                return combined_summary
        except Exception as e:
            logger.error(f"AI summarization failed: {e}")
            return self.advanced_extractive_summary(text)
    def generate_enhanced_key_points(self, text: str, num_points: int = 7) -> List[str]:
        """Generate key points with improved extraction and categorization"""
        if not text.strip():
            return []

        if NLTK_AVAILABLE:
            sentences = sent_tokenize(text)
        else:
            sentences = [s.strip() for s in re.split(r'[.!?]+', text) if s.strip()]

        # Enhanced key point indicators with categories
        key_indicators = {
            'conclusions': ['conclusion', 'conclude', 'result', 'outcome', 'finding', 'discovered'],
            'objectives': ['objective', 'goal', 'purpose', 'aim', 'target', 'mission'],
            'methods': ['method', 'approach', 'technique', 'procedure', 'process', 'way'],
            'importance': ['important', 'significant', 'crucial', 'essential', 'key', 'main', 'primary'],
            'recommendations': ['recommend', 'suggest', 'propose', 'should', 'must', 'need to'],
            'problems': ['problem', 'issue', 'challenge', 'difficulty', 'obstacle', 'concern'],
            'benefits': ['benefit', 'advantage', 'improvement', 'enhancement', 'positive', 'gain']
        }

        scored_sentences = []
        for sentence in sentences:
            if len(sentence.split()) < 6:  # Skip very short sentences
                continue

            score = 0
            sentence_lower = sentence.lower()
            category = 'general'

            # Category-based scoring
            for cat, indicators in key_indicators.items():
                category_score = sum(2 if indicator in sentence_lower else 0 for indicator in indicators)
                if category_score > score:
                    score = category_score
                    category = cat

            # Structural scoring
            if sentence.strip().startswith(('•', '-', '1.', '2.', '3.', '4.', '5.')):
                score += 4

            # Punctuation indicators
            if any(punct in sentence for punct in [':', ';', '–', '"']):
                score += 1

            # Length scoring (prefer moderate length for key points)
            word_count = len(sentence.split())
            if 8 <= word_count <= 20:
                score += 3
            elif 6 <= word_count <= 30:
                score += 2
            elif 4 <= word_count <= 40:
                score += 1

            # Numerical data bonus
            if re.search(r'\b\d+(?:\.\d+)?%?\b', sentence):
                score += 2

            # Avoid very generic sentences
            generic_words = ['the', 'this', 'that', 'there', 'it', 'they']
            if sentence.split()[0].lower() in generic_words:
                score -= 1

            if score > 0:
                scored_sentences.append((sentence.strip(), score, category))

        # Sort by score and diversify by category
        scored_sentences.sort(key=lambda x: x[1], reverse=True)

        # Select diverse key points
        selected_points = []
        used_categories = set()

        # First pass: get the highest scoring point from each category
        for sentence, score, category in scored_sentences:
            if len(selected_points) >= num_points:
                break
            if category not in used_categories:
                selected_points.append(sentence)
                used_categories.add(category)

        # Second pass: fill remaining slots with highest scoring sentences
        for sentence, score, category in scored_sentences:
            if len(selected_points) >= num_points:
                break
            if sentence not in selected_points:
                selected_points.append(sentence)

        return selected_points[:num_points]
    def generate_document_outline(self, text: str) -> List[str]:
        """Generate a structured outline of the document"""
        if not text.strip():
            return []

        lines = text.split('\n')
        outline = []

        # Look for headers, numbered sections, etc.
        header_patterns = [
            r'^#{1,6}\s+(.+)$',              # Markdown headers
            r'^(\d+\.?\s+[A-Z][^.]{10,})$',  # Numbered sections
            r'^([A-Z][A-Z\s]{5,})$',         # ALL CAPS headers
            r'^([A-Z][a-z\s]{10,}:)$',       # Title Case with colon
        ]

        for line in lines:
            line = line.strip()
            if not line:
                continue
            for pattern in header_patterns:
                match = re.match(pattern, line)
                if match:
                    outline.append(match.group(1).strip())
                    break

        return outline[:10]  # Limit to 10 outline items

    def process_document(self, file_path: str, summary_type: str = "ai",
                         summary_length: str = "medium") -> Tuple[Optional[Dict], Optional[str]]:
        """Enhanced document processing with caching and comprehensive analysis"""
        if not file_path:
            return None, "No file provided."

        try:
            # Check cache
            file_hash = self._get_file_hash(file_path)
            cache_key = f"{file_hash}_{summary_type}_{summary_length}"
            if cache_key in self.cache:
                logger.info("Returning cached result")
                return self.cache[cache_key], None

            # Extract text based on file type
            file_extension = Path(file_path).suffix.lower()
            if file_extension == '.pdf':
                text = self.extract_text_from_pdf(file_path)
            elif file_extension == '.docx':
                text = self.extract_text_from_docx(file_path)
            elif file_extension in ['.txt', '.md', '.rtf']:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
            else:
                return None, f"Unsupported file type: {file_extension}"

            if not text.strip() or "not available" in text.lower():
                return None, "No text could be extracted from the document or extraction failed."

            # Clean text
            text = re.sub(r'\n{3,}', '\n\n', text)  # Reduce excessive newlines
            text = re.sub(r' {2,}', ' ', text)      # Reduce excessive spaces

            # Get comprehensive statistics
            stats = self.get_enhanced_document_stats(text)

            # Generate summary based on type and length
            length_params = {
                "short": {"sentences": 2, "max_length": 80, "min_length": 30},
                "medium": {"sentences": 4, "max_length": 150, "min_length": 50},
                "long": {"sentences": 6, "max_length": 250, "min_length": 100},
                "detailed": {"sentences": 8, "max_length": 400, "min_length": 150}
            }
            params = length_params.get(summary_length, length_params["medium"])

            # Generate summary
            if summary_type == "ai" and self.summarizer:
                summary = self.ai_summary(text, params["max_length"], params["min_length"])
            else:
                summary = self.advanced_extractive_summary(text, params["sentences"])

            # Generate enhanced features
            key_points = self.generate_enhanced_key_points(text, 7)
            outline = self.generate_document_outline(text)

            # Calculate readability (simple approximation)
            avg_sentence_length = stats.get('avg_sentence_length', 0)
            readability_score = max(0, min(100, 100 - (avg_sentence_length * 2)))

            result = {
                'original_text': text[:2000] + "..." if len(text) > 2000 else text,  # Truncate for display
                'full_text_length': len(text),
                'summary': summary,
                'key_points': key_points,
                'outline': outline,
                'stats': stats,
                'readability_score': readability_score,
                'file_name': Path(file_path).name,
                'file_size': os.path.getsize(file_path),
                'processing_time': datetime.now().isoformat(),
                'summary_type': summary_type,
                'summary_length': summary_length,
                'model_used': 'AI (BART/T5)' if self.summarizer else 'Extractive'
            }

            # Cache result
            self.cache[cache_key] = result
            return result, None
        except Exception as e:
            logger.error(f"Document processing error: {e}")
            return None, f"Error processing document: {str(e)}"
def create_catalyst_interface():
    """Create the CatalystGPT-4 document summarizer interface"""
    summarizer = AdvancedDocumentSummarizer()

    # Enhanced CSS with modern styling
    css = """
    .catalyst-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        padding: 30px;
        border-radius: 20px;
        text-align: center;
        margin-bottom: 25px;
        box-shadow: 0 10px 30px rgba(0,0,0,0.2);
    }
    .summary-container {
        background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%);
        color: white;
        padding: 25px;
        border-radius: 15px;
        margin: 15px 0;
        box-shadow: 0 8px 25px rgba(0,0,0,0.15);
    }
    .stats-container {
        background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
        color: white;
        padding: 20px;
        border-radius: 12px;
        margin: 15px 0;
        box-shadow: 0 6px 20px rgba(0,0,0,0.1);
    }
    .key-points-container {
        background: linear-gradient(135deg, #4ecdc4 0%, #44a08d 100%);
        color: white;
        padding: 20px;
        border-radius: 12px;
        margin: 15px 0;
        box-shadow: 0 6px 20px rgba(0,0,0,0.1);
    }
    .outline-container {
        background: linear-gradient(135deg, #fa709a 0%, #fee140 100%);
        color: white;
        padding: 20px;
        border-radius: 12px;
        margin: 15px 0;
        box-shadow: 0 6px 20px rgba(0,0,0,0.1);
    }
    .error-container {
        background: linear-gradient(135deg, #ff9a9e 0%, #fecfef 100%);
        color: #721c24;
        padding: 20px;
        border-radius: 12px;
        margin: 15px 0;
        border-left: 5px solid #dc3545;
    }
    .control-panel {
        background: linear-gradient(135deg, #f6f9fc 0%, #e9ecef 100%);
        padding: 25px;
        border-radius: 15px;
        margin: 15px 0;
        border: 1px solid #dee2e6;
        box-shadow: 0 4px 15px rgba(0,0,0,0.05);
    }
    .file-upload-area {
        border: 3px dashed #007bff;
        border-radius: 15px;
        padding: 40px;
        text-align: center;
        background: linear-gradient(135deg, #f8f9ff 0%, #e3f2fd 100%);
        transition: all 0.3s ease;
        margin: 15px 0;
    }
    .file-upload-area:hover {
        border-color: #0056b3;
        background: linear-gradient(135deg, #f0f7ff 0%, #e1f5fe 100%);
        transform: translateY(-2px);
    }
    .metric-card {
        background: white;
        padding: 15px;
        border-radius: 10px;
        margin: 5px;
        box-shadow: 0 2px 8px rgba(0,0,0,0.1);
        text-align: center;
    }
    .sentiment-indicator {
        display: inline-block;
        padding: 5px 12px;
        border-radius: 20px;
        font-weight: bold;
        font-size: 12px;
        margin: 2px;
    }
    .sentiment-positive { background: #d4edda; color: #155724; }
    .sentiment-negative { background: #f8d7da; color: #721c24; }
    .sentiment-neutral { background: #d1ecf1; color: #0c5460; }
    .progress-bar {
        background: #e9ecef;
        border-radius: 10px;
        overflow: hidden;
        height: 8px;
        margin: 5px 0;
    }
    .progress-fill {
        height: 100%;
        background: linear-gradient(90deg, #28a745, #20c997);
        transition: width 0.3s ease;
    }
    """

    def format_file_size(size_bytes):
        """Convert bytes to human readable format"""
        for unit in ['B', 'KB', 'MB', 'GB']:
            if size_bytes < 1024.0:
                return f"{size_bytes:.1f} {unit}"
            size_bytes /= 1024.0
        return f"{size_bytes:.1f} TB"

    def get_sentiment_indicator(sentiment_score):
        """Get sentiment indicator HTML"""
        if sentiment_score > 0.1:
            return '<span class="sentiment-indicator sentiment-positive">Positive</span>'
        elif sentiment_score < -0.1:
            return '<span class="sentiment-indicator sentiment-negative">Negative</span>'
        else:
            return '<span class="sentiment-indicator sentiment-neutral">Neutral</span>'
    def process_and_display(file, summary_type, summary_length, enable_ai_features):
        """Enhanced processing with comprehensive results display"""
        if file is None:
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(value="""
                    <div style="text-align: center; padding: 60px; color: #666;">
                        <h3>CatalystGPT-4 Ready</h3>
                        <p>Upload a document to begin advanced AI-powered analysis</p>
                        <p><small>Supports: PDF, Word (.docx), Text (.txt, .md, .rtf)</small></p>
                    </div>
                """, visible=True)
            )

        try:
            # Use AI features based on toggle
            actual_summary_type = summary_type if enable_ai_features else "extractive"
            result, error = summarizer.process_document(file.name, actual_summary_type, summary_length)

            if error:
                error_html = f'''
                <div class="error-container">
                    <h4>Processing Error</h4>
                    <p><strong>Error:</strong> {error}</p>
                    <p><small>Please try a different file or check the file format.</small></p>
                </div>
                '''
                return (
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(value=error_html, visible=True)
                )

            # Format summary display
            summary_html = f'''
            <div class="summary-container">
                <h3>Document Summary</h3>
                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 15px; margin-bottom: 15px;">
                    <div><strong>File:</strong> {result["file_name"]}</div>
                    <div><strong>Size:</strong> {format_file_size(result["file_size"])}</div>
                    <div><strong>Model:</strong> {result["model_used"]}</div>
                    <div><strong>Length:</strong> {result["summary_length"].title()}</div>
                </div>
                <div style="background: rgba(255,255,255,0.15); padding: 20px; border-radius: 10px; line-height: 1.6;">
                    {result["summary"]}
                </div>
            </div>
            '''

            # Format comprehensive statistics
            stats = result["stats"]
            readability = result["readability_score"]

            # Create readability indicator
            readability_color = "#28a745" if readability > 70 else "#ffc107" if readability > 40 else "#dc3545"
            readability_text = "Easy" if readability > 70 else "Moderate" if readability > 40 else "Complex"

            stats_html = f'''
            <div class="stats-container">
                <h3>Document Analytics</h3>
                <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;">
                    <div class="metric-card">
                        <h4 style="margin: 0; color: #007bff;">{stats["word_count"]:,}</h4>
                        <small>Words</small>
                    </div>
                    <div class="metric-card">
                        <h4 style="margin: 0; color: #28a745;">{stats["estimated_reading_time"]} min</h4>
                        <small>Reading Time</small>
                    </div>
                    <div class="metric-card">
                        <h4 style="margin: 0; color: #17a2b8;">{stats["sentence_count"]:,}</h4>
                        <small>Sentences</small>
                    </div>
                    <div class="metric-card">
                        <h4 style="margin: 0; color: #6f42c1;">{stats.get("unique_words", "N/A")}</h4>
                        <small>Unique Words</small>
                    </div>
                </div>
                <div style="margin: 20px 0;">
                    <h4>Readability Score</h4>
                    <div class="progress-bar">
                        <div class="progress-fill" style="width: {readability}%; background-color: {readability_color};"></div>
                    </div>
                    <p><strong>{readability:.1f}/100</strong> - {readability_text} to read</p>
                </div>
            '''

            # Add sentiment analysis if available
            if stats.get('sentiment'):
                sentiment = stats['sentiment']
                sentiment_html = get_sentiment_indicator(sentiment['compound'])
                stats_html += f'''
                <div style="margin: 20px 0;">
                    <h4>Document Sentiment</h4>
                    {sentiment_html}
                    <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 10px; margin-top: 10px;">
                        <small>Positive: {sentiment['positive']:.2f}</small>
                        <small>Negative: {sentiment['negative']:.2f}</small>
                        <small>Neutral: {sentiment['neutral']:.2f}</small>
                    </div>
                </div>
                '''

            # Add word frequency
            if stats.get('top_words'):
                stats_html += f'''
                <div style="margin: 20px 0;">
                    <h4>Most Frequent Words</h4>
                    <div style="display: flex; flex-wrap: wrap; gap: 8px; margin-top: 10px;">
                        {" ".join([f'<span style="background: rgba(255,255,255,0.2); padding: 6px 12px; border-radius: 15px; font-size: 13px;">{word} ({count})</span>' for word, count in stats["top_words"][:10]])}
                    </div>
                </div>
                '''

            stats_html += '</div>'

            # Format key points
            key_points_html = f'''
            <div class="key-points-container">
                <h3>Key Insights</h3>
                <ul style="list-style: none; padding: 0;">
            '''
            for i, point in enumerate(result["key_points"], 1):
                key_points_html += f'<li style="margin-bottom: 12px; padding: 10px; background: rgba(255,255,255,0.15); border-radius: 8px;"><strong>{i}.</strong> {point}</li>'
            key_points_html += '</ul></div>'

            # Format document outline
            outline_html = ""
            if result.get("outline"):
                outline_html = f'''
                <div class="outline-container">
                    <h3>Document Structure</h3>
                    <ol style="padding-left: 20px;">
                '''
                for item in result["outline"]:
                    outline_html += f'<li style="margin-bottom: 8px; padding: 5px 0;">{item}</li>'
                outline_html += '</ol></div>'

            return (
                gr.update(value=summary_html, visible=True),
                gr.update(value=stats_html, visible=True),
                gr.update(value=key_points_html, visible=True),
                gr.update(value=outline_html, visible=True if outline_html else False),
                gr.update(visible=False)
            )
        except Exception as e:
            error_html = f'''
            <div class="error-container">
                <h4>Unexpected Error</h4>
                <p><strong>Details:</strong> {str(e)}</p>
                <p><small>Please try again or contact support if the issue persists.</small></p>
            </div>
            '''
            return (
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(value=error_html, visible=True)
            )
    # Create the main interface
    with gr.Blocks(css=css, title="CatalystGPT-4 Document Summarizer", theme=gr.themes.Soft()) as demo:
        # Header
        gr.HTML("""
        <div class="catalyst-header">
            <h1 style="margin: 0; font-size: 3em; font-weight: bold;">CatalystGPT-4</h1>
            <h2 style="margin: 10px 0; font-size: 1.5em; opacity: 0.9;">Advanced Document Summarizer</h2>
            <p style="margin: 15px 0 0 0; font-size: 1.1em; opacity: 0.8;">
                Powered by AI • Extractive & Abstractive Summarization • Comprehensive Analytics
            </p>
        </div>
        """)

        with gr.Row():
            # Left column - Enhanced Controls
            with gr.Column(scale=1):
                with gr.Group():
                    gr.HTML('<div class="control-panel">')
                    gr.Markdown("### Document Upload")
                    file_upload = gr.File(
                        label="Choose your document",
                        file_types=[".pdf", ".docx", ".txt", ".md", ".rtf"],
                        elem_classes="file-upload-area"
                    )

                    gr.Markdown("### Analysis Settings")
                    enable_ai_features = gr.Checkbox(
                        label="Enable AI Features",
                        value=TRANSFORMERS_AVAILABLE,
                        info="Use advanced AI models for better summarization",
                        interactive=TRANSFORMERS_AVAILABLE
                    )
                    summary_type = gr.Radio(
                        choices=[
                            ("AI Summary (Neural)", "ai"),
                            ("Extractive Summary", "extractive")
                        ],
                        value="ai" if TRANSFORMERS_AVAILABLE else "extractive",
                        label="Summarization Method",
                        info="AI generates new text, Extractive selects key sentences"
                    )
                    summary_length = gr.Radio(
                        choices=[
                            ("Short & Concise", "short"),
                            ("Standard Length", "medium"),
                            ("Detailed Analysis", "long"),
                            ("Comprehensive Report", "detailed")
                        ],
                        value="medium",
                        label="Analysis Depth",
                        info="Choose the level of detail for your analysis"
                    )
                    analyze_btn = gr.Button(
                        "Analyze Document",
                        variant="primary",
                        size="lg",
                        elem_classes="analyze-button"
                    )
                    gr.HTML('</div>')

                # Enhanced Library Status
                gr.Markdown(f"""
                ### System Status
                **Core Features:**
                - **PDF Processing:** {"✅ PyMuPDF" if PYMUPDF_AVAILABLE else ("✅ PyPDF2" if PDF_AVAILABLE else "❌ Not Available")}
                - **Word Documents:** {"✅ Available" if DOCX_AVAILABLE else "❌ Install python-docx"}
                - **AI Summarization:** {"✅ Available" if TRANSFORMERS_AVAILABLE else "❌ Install transformers"}
                - **Advanced NLP:** {"✅ Available" if NLTK_AVAILABLE else "⚠️ Basic processing"}
                - **Sentiment Analysis:** {"✅ Available" if (NLTK_AVAILABLE and summarizer.sentiment_analyzer) else "❌ Not Available"}

                **Performance:**
                - **Device:** {"GPU" if DEVICE >= 0 else "CPU"}
                - **Cache:** {"Enabled" if summarizer.cache is not None else "Disabled"}
                """)
            # Right column - Enhanced Results
            with gr.Column(scale=2):
                # Welcome message
                welcome_msg = gr.HTML(
                    value="""
                    <div style="text-align: center; padding: 80px 20px; color: #666;">
                        <div style="font-size: 4em; margin-bottom: 20px;">📄</div>
                        <h2 style="color: #333; margin-bottom: 15px;">Ready for Analysis</h2>
                        <p style="font-size: 1.1em; margin-bottom: 10px;">Upload any document to unlock AI-powered insights</p>
                        <p><small style="color: #888;">Supports PDF, Word, Text, Markdown, and RTF files</small></p>
                        <div style="margin-top: 30px; padding: 20px; background: #f8f9fa; border-radius: 10px; display: inline-block;">
                            <strong>Features:</strong> AI Summarization • Key Points • Analytics • Sentiment Analysis
                        </div>
                    </div>
                    """,
                    visible=True
                )

                # Results sections
                summary_display = gr.HTML(visible=False)
                stats_display = gr.HTML(visible=False)
                key_points_display = gr.HTML(visible=False)
                outline_display = gr.HTML(visible=False)
                error_display = gr.HTML(visible=False)

        # Event handlers
        def on_file_change(file):
            if file is None:
                return (
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False)
                )
            else:
                return (
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False)
                )

        # Auto-hide welcome when file uploaded
        file_upload.change(
            fn=on_file_change,
            inputs=[file_upload],
            outputs=[welcome_msg, summary_display, stats_display, key_points_display, outline_display, error_display]
        )

        # Process document on button click
        analyze_btn.click(
            fn=process_and_display,
            inputs=[file_upload, summary_type, summary_length, enable_ai_features],
            outputs=[summary_display, stats_display, key_points_display, outline_display, error_display]
        )

        # Auto-process when settings change (if file uploaded)
        for component in [summary_type, summary_length, enable_ai_features]:
            component.change(
                fn=process_and_display,
                inputs=[file_upload, summary_type, summary_length, enable_ai_features],
                outputs=[summary_display, stats_display, key_points_display, outline_display, error_display]
            )
        # Enhanced Footer
        gr.HTML("""
        <div style="margin-top: 50px; padding: 30px; background: linear-gradient(135deg, #f8f9fa 0%, #e9ecef 100%);
                    border-radius: 15px; text-align: center; border-top: 3px solid #007bff;">
            <h3 style="color: #333; margin-bottom: 20px;">Installation & Setup</h3>
            <div style="background: #343a40; color: #fff; padding: 15px; border-radius: 8px;
                        font-family: 'Courier New', monospace; margin: 15px 0;">
                <strong>Quick Install:</strong><br>
                pip install gradio python-docx PyPDF2 transformers torch nltk PyMuPDF
            </div>
            <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin-top: 20px;">
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
                    <strong>Core Features</strong><br>
                    <small>Multi-format support, AI summarization, key insights extraction</small>
                </div>
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
                    <strong>Advanced Analytics</strong><br>
                    <small>Sentiment analysis, readability scoring, word frequency</small>
                </div>
                <div style="background: white; padding: 15px; border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);">
                    <strong>Performance</strong><br>
                    <small>Intelligent caching, GPU acceleration, batch processing</small>
                </div>
            </div>
            <p style="margin-top: 20px; color: #666;">
                <strong>CatalystGPT-4</strong> - Advanced Document Analysis Platform
            </p>
        </div>
        """)

    return demo
if __name__ == "__main__":
    demo = create_catalyst_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
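
# Suggested requirements for this Space, mirroring the "Quick Install" command
# shown in the footer above. This list is a sketch added for convenience; the
# original file does not pin versions, so none are pinned here either:
#
#   gradio
#   python-docx
#   PyPDF2
#   PyMuPDF
#   nltk
#   transformers
#   torch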