Spaces: Running on Zero
| import re | |
| import string | |
| from typing import Dict, List, Tuple, Optional, Any | |
| import traceback | |
class NaturalLanguageProcessor:
    """Lightweight keyword-based NLP for user free-text input.

    Handles text preprocessing, breed/lifestyle keyword extraction and
    simple sentiment scoring for a dog-recommendation workflow.
    """

    def __init__(self):
        """Build the static lookup tables used by all extraction methods."""
        # English stop words filtered out during keyword extraction.
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'would', 'i', 'me', 'my', 'we', 'us',
            'our', 'you', 'your', 'they', 'them', 'their'
        }
        # Colloquial breed names -> canonical breed ids. A list value means
        # the alias is ambiguous and fans out to several candidate breeds.
        # NOTE(review): 'australian shepherd'/'aussie' map to 'kelpie' --
        # presumably the closest class in the breed dataset; confirm.
        self.breed_aliases = {
            'lab': 'labrador_retriever',
            'labrador': 'labrador_retriever',
            'golden': 'golden_retriever',
            'retriever': ['labrador_retriever', 'golden_retriever'],
            'german shepherd': 'german_shepherd',
            'shepherd': 'german_shepherd',
            'border collie': 'border_collie',
            'collie': ['border_collie', 'collie'],
            'bulldog': ['french_bulldog', 'english_bulldog'],
            'french bulldog': 'french_bulldog',
            'poodle': ['standard_poodle', 'miniature_poodle', 'toy_poodle'],
            'husky': 'siberian_husky',
            'siberian husky': 'siberian_husky',
            'beagle': 'beagle',
            'yorkshire terrier': 'yorkshire_terrier',
            'yorkie': 'yorkshire_terrier',
            'chihuahua': 'chihuahua',
            'dachshund': 'dachshund',
            'wiener dog': 'dachshund',
            'rottweiler': 'rottweiler',
            'rottie': 'rottweiler',
            'boxer': 'boxer',
            'great dane': 'great_dane',
            'dane': 'great_dane',
            'mastiff': ['bull_mastiff', 'tibetan_mastiff'],
            'pitbull': 'american_staffordshire_terrier',
            'pit bull': 'american_staffordshire_terrier',
            'shih tzu': 'shih-tzu',
            'maltese': 'maltese_dog',
            'pug': 'pug',
            'basset hound': 'basset',
            'bloodhound': 'bloodhound',
            'australian shepherd': 'kelpie',
            'aussie': 'kelpie'
        }
        # Lifestyle keywords grouped as category -> bucket -> trigger phrases.
        self.lifestyle_keywords = {
            'living_space': {
                'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living', 'urban'],
                'house': ['house', 'home', 'yard', 'garden', 'suburban', 'large space'],
                'farm': ['farm', 'rural', 'country', 'acreage', 'ranch']
            },
            'activity_level': {
                'very_high': ['very active', 'extremely energetic', 'marathon runner', 'athlete'],
                'high': ['active', 'energetic', 'exercise', 'hiking', 'running', 'outdoor activities',
                         'sports', 'jogging', 'biking', 'adventure'],
                'moderate': ['moderate exercise', 'some activity', 'weekend walks', 'occasional exercise'],
                'low': ['calm', 'lazy', 'indoor', 'low energy', 'couch potato', 'sedentary', 'quiet lifestyle']
            },
            'family_situation': {
                'children': ['children', 'kids', 'toddlers', 'babies', 'family with children', 'young family'],
                'elderly': ['elderly', 'senior', 'old', 'retired', 'senior citizen'],
                'single': ['single', 'alone', 'individual', 'bachelor', 'solo'],
                'couple': ['couple', 'two people', 'pair', 'duo']
            },
            'noise_tolerance': {
                'low': ['quiet', 'silent', 'noise-sensitive', 'peaceful', 'no barking', 'minimal noise'],
                'moderate': ['some noise ok', 'moderate barking', 'normal noise'],
                'high': ['loud ok', 'barking fine', 'noise tolerant', 'doesn\'t mind noise']
            },
            'size_preference': {
                'small': ['small', 'tiny', 'little', 'compact', 'lap dog', 'petite', 'miniature'],
                'medium': ['medium', 'moderate size', 'average', 'mid-size'],
                'large': ['large', 'big', 'huge', 'giant', 'massive', 'substantial'],
                'varies': ['any size', 'size doesn\'t matter', 'flexible on size']
            },
            'experience_level': {
                'beginner': ['first time', 'beginner', 'new to dogs', 'inexperienced', 'never had'],
                'some': ['some experience', 'had dogs before', 'moderate experience'],
                'experienced': ['experienced', 'expert', 'very experienced', 'professional', 'trainer']
            },
            'grooming_commitment': {
                'low': ['low maintenance', 'easy care', 'minimal grooming', 'wash and go'],
                'moderate': ['moderate grooming', 'some brushing', 'regular care'],
                'high': ['high maintenance', 'lots of grooming', 'professional grooming', 'daily brushing']
            },
            'special_needs': {
                'guard': ['guard dog', 'protection', 'security', 'watchdog', 'guardian'],
                'therapy': ['therapy dog', 'emotional support', 'comfort', 'calm companion'],
                'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding', 'allergy friendly'],
                'working': ['working dog', 'job', 'task', 'service dog'],
                'companion': ['companion', 'friend', 'buddy', 'lap dog', 'cuddle']
            }
        }
        # Phrases signalling preference strength (0.0 = hate ... 1.0 = love).
        self.preference_indicators = {
            'love': 1.0,
            'prefer': 0.9,
            'like': 0.8,
            'want': 0.8,
            'interested in': 0.7,
            'considering': 0.6,
            'ok with': 0.5,
            'don\'t mind': 0.4,
            'not interested': 0.2,
            'dislike': 0.1,
            'hate': 0.0
        }
        # Ordering words and the weight multiplier they imply.
        self.order_keywords = {
            'first': 1.0, 'most': 1.0, 'primary': 1.0, 'main': 1.0,
            'second': 0.8, 'then': 0.8, 'next': 0.8,
            'third': 0.6, 'also': 0.6, 'additionally': 0.6,
            'last': 0.4, 'least': 0.4, 'finally': 0.4
        }
| def preprocess_text(self, text: str) -> str: | |
| """ | |
| Text preprocessing | |
| Args: | |
| text: Raw text | |
| Returns: | |
| Preprocessed text | |
| """ | |
| if not text: | |
| return "" | |
| # Convert to lowercase | |
| text = text.lower().strip() | |
| # Remove punctuation (keep some meaningful ones) | |
| text = re.sub(r'[^\w\s\-\']', ' ', text) | |
| # Handle extra whitespace | |
| text = re.sub(r'\s+', ' ', text) | |
| return text | |
| def extract_breed_mentions(self, text: str) -> List[Tuple[str, float]]: | |
| """ | |
| Extract mentioned breeds and their preference levels from text | |
| Args: | |
| text: Input text | |
| Returns: | |
| List of (breed_name, preference_score) tuples | |
| """ | |
| text = self.preprocess_text(text) | |
| breed_mentions = [] | |
| try: | |
| # Check each breed alias | |
| for alias, standard_breed in self.breed_aliases.items(): | |
| if alias in text: | |
| # Find surrounding preference indicators | |
| preference_score = self._find_preference_score(text, alias) | |
| if isinstance(standard_breed, list): | |
| # If alias maps to multiple breeds, add all | |
| for breed in standard_breed: | |
| breed_mentions.append((breed, preference_score)) | |
| else: | |
| breed_mentions.append((standard_breed, preference_score)) | |
| # Deduplicate and merge scores | |
| breed_scores = {} | |
| for breed, score in breed_mentions: | |
| if breed in breed_scores: | |
| breed_scores[breed] = max(breed_scores[breed], score) | |
| else: | |
| breed_scores[breed] = score | |
| return list(breed_scores.items()) | |
| except Exception as e: | |
| print(f"Error extracting breed mentions: {str(e)}") | |
| return [] | |
| def _find_preference_score(self, text: str, breed_mention: str) -> float: | |
| """ | |
| Find preference score near breed mention | |
| Args: | |
| text: Text | |
| breed_mention: Breed mention | |
| Returns: | |
| Preference score (0.0-1.0) | |
| """ | |
| try: | |
| # Find breed mention position | |
| mention_pos = text.find(breed_mention) | |
| if mention_pos == -1: | |
| return 0.5 # Default neutral score | |
| # Check context (50 characters before and after) | |
| context_start = max(0, mention_pos - 50) | |
| context_end = min(len(text), mention_pos + len(breed_mention) + 50) | |
| context = text[context_start:context_end] | |
| # Find preference indicators | |
| max_score = 0.5 # Default score | |
| for indicator, score in self.preference_indicators.items(): | |
| if indicator in context: | |
| max_score = max(max_score, score) | |
| # Find order keywords | |
| for order_word, multiplier in self.order_keywords.items(): | |
| if order_word in context: | |
| max_score = max(max_score, max_score * multiplier) | |
| return max_score | |
| except Exception as e: | |
| print(f"Error finding preference score: {str(e)}") | |
| return 0.5 | |
| def extract_lifestyle_preferences(self, text: str) -> Dict[str, Dict[str, float]]: | |
| """ | |
| Extract lifestyle preferences from text | |
| Args: | |
| text: Input text | |
| Returns: | |
| Lifestyle preferences dictionary | |
| """ | |
| text = self.preprocess_text(text) | |
| preferences = {} | |
| try: | |
| for category, keywords_dict in self.lifestyle_keywords.items(): | |
| preferences[category] = {} | |
| for preference_type, keywords in keywords_dict.items(): | |
| score = 0.0 | |
| count = 0 | |
| for keyword in keywords: | |
| if keyword in text: | |
| # Calculate keyword occurrence intensity | |
| keyword_count = text.count(keyword) | |
| score += keyword_count | |
| count += keyword_count | |
| if count > 0: | |
| # Normalize score | |
| preferences[category][preference_type] = min(score / max(count, 1), 1.0) | |
| return preferences | |
| except Exception as e: | |
| print(f"Error extracting lifestyle preferences: {str(e)}") | |
| return {} | |
| def generate_search_keywords(self, text: str) -> List[str]: | |
| """ | |
| Generate keyword list for search | |
| Args: | |
| text: Input text | |
| Returns: | |
| List of keywords | |
| """ | |
| text = self.preprocess_text(text) | |
| keywords = [] | |
| try: | |
| # Tokenize and filter stop words | |
| words = text.split() | |
| for word in words: | |
| if len(word) > 2 and word not in self.stop_words: | |
| keywords.append(word) | |
| # Extract important phrases | |
| phrases = self._extract_phrases(text) | |
| keywords.extend(phrases) | |
| # Remove duplicates | |
| keywords = list(set(keywords)) | |
| return keywords | |
| except Exception as e: | |
| print(f"Error generating search keywords: {str(e)}") | |
| return [] | |
| def _extract_phrases(self, text: str) -> List[str]: | |
| """ | |
| Extract important phrases | |
| Args: | |
| text: Input text | |
| Returns: | |
| List of phrases | |
| """ | |
| phrases = [] | |
| # Define important phrase patterns | |
| phrase_patterns = [ | |
| r'good with \w+', | |
| r'apartment \w+', | |
| r'family \w+', | |
| r'exercise \w+', | |
| r'grooming \w+', | |
| r'noise \w+', | |
| r'training \w+', | |
| r'health \w+', | |
| r'\w+ friendly', | |
| r'\w+ tolerant', | |
| r'\w+ maintenance', | |
| r'\w+ energy', | |
| r'\w+ barking', | |
| r'\w+ shedding' | |
| ] | |
| for pattern in phrase_patterns: | |
| matches = re.findall(pattern, text) | |
| phrases.extend(matches) | |
| return phrases | |
| def analyze_sentiment(self, text: str) -> Dict[str, float]: | |
| """ | |
| Analyze text sentiment | |
| Args: | |
| text: Input text | |
| Returns: | |
| Sentiment analysis results {'positive': 0.0-1.0, 'negative': 0.0-1.0, 'neutral': 0.0-1.0} | |
| """ | |
| text = self.preprocess_text(text) | |
| positive_words = [ | |
| 'love', 'like', 'want', 'prefer', 'good', 'great', 'excellent', | |
| 'perfect', 'ideal', 'wonderful', 'amazing', 'fantastic' | |
| ] | |
| negative_words = [ | |
| 'hate', 'dislike', 'bad', 'terrible', 'awful', 'horrible', | |
| 'not good', 'don\'t want', 'avoid', 'against', 'problem' | |
| ] | |
| positive_count = sum(1 for word in positive_words if word in text) | |
| negative_count = sum(1 for word in negative_words if word in text) | |
| total_words = len(text.split()) | |
| if total_words == 0: | |
| return {'positive': 0.0, 'negative': 0.0, 'neutral': 1.0} | |
| positive_ratio = positive_count / total_words | |
| negative_ratio = negative_count / total_words | |
| neutral_ratio = 1.0 - positive_ratio - negative_ratio | |
| return { | |
| 'positive': positive_ratio, | |
| 'negative': negative_ratio, | |
| 'neutral': max(0.0, neutral_ratio) | |
| } | |
| def extract_implicit_preferences(self, text: str) -> Dict[str, Any]: | |
| """ | |
| Extract implicit preferences from text | |
| Args: | |
| text: Input text | |
| Returns: | |
| Dictionary of implicit preferences | |
| """ | |
| text = self.preprocess_text(text) | |
| implicit_prefs = {} | |
| try: | |
| # Infer preferences from mentioned activities | |
| if any(activity in text for activity in ['hiking', 'running', 'jogging', 'outdoor']): | |
| implicit_prefs['exercise_needs'] = 'high' | |
| implicit_prefs['size_preference'] = 'medium_to_large' | |
| # Infer from living environment | |
| if any(env in text for env in ['apartment', 'small space', 'city']): | |
| implicit_prefs['size_preference'] = 'small_to_medium' | |
| implicit_prefs['noise_tolerance'] = 'low' | |
| implicit_prefs['exercise_needs'] = 'moderate' | |
| # Infer from family situation | |
| if 'children' in text or 'kids' in text: | |
| implicit_prefs['temperament'] = 'gentle_patient' | |
| implicit_prefs['good_with_children'] = True | |
| # Infer from experience level | |
| if any(exp in text for exp in ['first time', 'beginner', 'new to']): | |
| implicit_prefs['care_level'] = 'low_to_moderate' | |
| implicit_prefs['training_difficulty'] = 'easy' | |
| # Infer from time commitment | |
| if any(time in text for time in ['busy', 'no time', 'low maintenance']): | |
| implicit_prefs['grooming_needs'] = 'low' | |
| implicit_prefs['care_level'] = 'low' | |
| implicit_prefs['exercise_needs'] = 'low_to_moderate' | |
| return implicit_prefs | |
| except Exception as e: | |
| print(f"Error extracting implicit preferences: {str(e)}") | |
| return {} | |
| def validate_input(self, text: str) -> Dict[str, Any]: | |
| """ | |
| Validate input text validity | |
| Args: | |
| text: Input text | |
| Returns: | |
| Validation results dictionary | |
| """ | |
| if not text or not text.strip(): | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Empty input', | |
| 'suggestions': ['Please provide a description of your preferences'] | |
| } | |
| text = text.strip() | |
| # Check length | |
| if len(text) < 10: | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Input too short', | |
| 'suggestions': ['Please provide more details about your preferences'] | |
| } | |
| if len(text) > 1000: | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Input too long', | |
| 'suggestions': ['Please provide a more concise description'] | |
| } | |
| # Check for meaningful content | |
| processed_text = self.preprocess_text(text) | |
| meaningful_words = [word for word in processed_text.split() | |
| if len(word) > 2 and word not in self.stop_words] | |
| if len(meaningful_words) < 3: | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Not enough meaningful content', | |
| 'suggestions': ['Please provide more specific details about your lifestyle and preferences'] | |
| } | |
| return { | |
| 'is_valid': True, | |
| 'word_count': len(meaningful_words), | |
| 'suggestions': [] | |
| } | |
def get_nlp_processor():
    """Return a NaturalLanguageProcessor instance, or None on failure."""
    try:
        processor = NaturalLanguageProcessor()
    except Exception as e:
        print(f"Error creating NLP processor: {str(e)}")
        return None
    return processor