Spaces: Running on Zero
| import re | |
| import string | |
| from typing import Dict, List, Tuple, Optional, Any | |
| import traceback | |
class NaturalLanguageProcessor:
    """Lightweight keyword-based NLP for user free-text input.

    Handles text preprocessing, breed/lifestyle keyword extraction and
    simple sentiment scoring for a dog-recommendation workflow.
    """

    def __init__(self):
        """Build the static lookup tables used by all extraction methods."""
        # English stop words filtered out during keyword extraction.
        self.stop_words = {
            'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
            'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
            'to', 'was', 'will', 'with', 'would', 'i', 'me', 'my', 'we', 'us',
            'our', 'you', 'your', 'they', 'them', 'their'
        }
        # Colloquial breed names -> canonical breed ids. A list value means
        # the alias is ambiguous and fans out to several candidate breeds.
        # NOTE(review): 'australian shepherd'/'aussie' map to 'kelpie' --
        # presumably the closest class in the breed dataset; confirm.
        self.breed_aliases = {
            'lab': 'labrador_retriever',
            'labrador': 'labrador_retriever',
            'golden': 'golden_retriever',
            'retriever': ['labrador_retriever', 'golden_retriever'],
            'german shepherd': 'german_shepherd',
            'shepherd': 'german_shepherd',
            'border collie': 'border_collie',
            'collie': ['border_collie', 'collie'],
            'bulldog': ['french_bulldog', 'english_bulldog'],
            'french bulldog': 'french_bulldog',
            'poodle': ['standard_poodle', 'miniature_poodle', 'toy_poodle'],
            'husky': 'siberian_husky',
            'siberian husky': 'siberian_husky',
            'beagle': 'beagle',
            'yorkshire terrier': 'yorkshire_terrier',
            'yorkie': 'yorkshire_terrier',
            'chihuahua': 'chihuahua',
            'dachshund': 'dachshund',
            'wiener dog': 'dachshund',
            'rottweiler': 'rottweiler',
            'rottie': 'rottweiler',
            'boxer': 'boxer',
            'great dane': 'great_dane',
            'dane': 'great_dane',
            'mastiff': ['bull_mastiff', 'tibetan_mastiff'],
            'pitbull': 'american_staffordshire_terrier',
            'pit bull': 'american_staffordshire_terrier',
            'shih tzu': 'shih-tzu',
            'maltese': 'maltese_dog',
            'pug': 'pug',
            'basset hound': 'basset',
            'bloodhound': 'bloodhound',
            'australian shepherd': 'kelpie',
            'aussie': 'kelpie'
        }
        # Lifestyle keywords grouped as category -> bucket -> trigger phrases.
        self.lifestyle_keywords = {
            'living_space': {
                'apartment': ['apartment', 'flat', 'condo', 'small space', 'city living', 'urban'],
                'house': ['house', 'home', 'yard', 'garden', 'suburban', 'large space'],
                'farm': ['farm', 'rural', 'country', 'acreage', 'ranch']
            },
            'activity_level': {
                'very_high': ['very active', 'extremely energetic', 'marathon runner', 'athlete'],
                'high': ['active', 'energetic', 'exercise', 'hiking', 'running', 'outdoor activities',
                         'sports', 'jogging', 'biking', 'adventure'],
                'moderate': ['moderate exercise', 'some activity', 'weekend walks', 'occasional exercise'],
                'low': ['calm', 'lazy', 'indoor', 'low energy', 'couch potato', 'sedentary', 'quiet lifestyle']
            },
            'family_situation': {
                'children': ['children', 'kids', 'toddlers', 'babies', 'family with children', 'young family'],
                'elderly': ['elderly', 'senior', 'old', 'retired', 'senior citizen'],
                'single': ['single', 'alone', 'individual', 'bachelor', 'solo'],
                'couple': ['couple', 'two people', 'pair', 'duo']
            },
            'noise_tolerance': {
                'low': ['quiet', 'silent', 'noise-sensitive', 'peaceful', 'no barking', 'minimal noise'],
                'moderate': ['some noise ok', 'moderate barking', 'normal noise'],
                'high': ['loud ok', 'barking fine', 'noise tolerant', 'doesn\'t mind noise']
            },
            'size_preference': {
                'small': ['small', 'tiny', 'little', 'compact', 'lap dog', 'petite', 'miniature'],
                'medium': ['medium', 'moderate size', 'average', 'mid-size'],
                'large': ['large', 'big', 'huge', 'giant', 'massive', 'substantial'],
                'varies': ['any size', 'size doesn\'t matter', 'flexible on size']
            },
            'experience_level': {
                'beginner': ['first time', 'beginner', 'new to dogs', 'inexperienced', 'never had'],
                'some': ['some experience', 'had dogs before', 'moderate experience'],
                'experienced': ['experienced', 'expert', 'very experienced', 'professional', 'trainer']
            },
            'grooming_commitment': {
                'low': ['low maintenance', 'easy care', 'minimal grooming', 'wash and go'],
                'moderate': ['moderate grooming', 'some brushing', 'regular care'],
                'high': ['high maintenance', 'lots of grooming', 'professional grooming', 'daily brushing']
            },
            'special_needs': {
                'guard': ['guard dog', 'protection', 'security', 'watchdog', 'guardian'],
                'therapy': ['therapy dog', 'emotional support', 'comfort', 'calm companion'],
                'hypoallergenic': ['hypoallergenic', 'allergies', 'non-shedding', 'allergy friendly'],
                'working': ['working dog', 'job', 'task', 'service dog'],
                'companion': ['companion', 'friend', 'buddy', 'lap dog', 'cuddle']
            }
        }
        # Phrases signalling preference strength (0.0 = hate ... 1.0 = love).
        self.preference_indicators = {
            'love': 1.0,
            'prefer': 0.9,
            'like': 0.8,
            'want': 0.8,
            'interested in': 0.7,
            'considering': 0.6,
            'ok with': 0.5,
            'don\'t mind': 0.4,
            'not interested': 0.2,
            'dislike': 0.1,
            'hate': 0.0
        }
        # Ordering words and the weight multiplier they imply.
        self.order_keywords = {
            'first': 1.0, 'most': 1.0, 'primary': 1.0, 'main': 1.0,
            'second': 0.8, 'then': 0.8, 'next': 0.8,
            'third': 0.6, 'also': 0.6, 'additionally': 0.6,
            'last': 0.4, 'least': 0.4, 'finally': 0.4
        }
| def preprocess_text(self, text: str) -> str: | |
| """ | |
| Text preprocessing | |
| Args: | |
| text: Raw text | |
| Returns: | |
| Preprocessed text | |
| """ | |
| if not text: | |
| return "" | |
| # Convert to lowercase | |
| text = text.lower().strip() | |
| # Remove punctuation (keep some meaningful ones) | |
| text = re.sub(r'[^\w\s\-\']', ' ', text) | |
| # Handle extra whitespace | |
| text = re.sub(r'\s+', ' ', text) | |
| return text | |
| def extract_breed_mentions(self, text: str) -> List[Tuple[str, float]]: | |
| """ | |
| Extract mentioned breeds and their preference levels from text | |
| Args: | |
| text: Input text | |
| Returns: | |
| List of (breed_name, preference_score) tuples | |
| """ | |
| text = self.preprocess_text(text) | |
| breed_mentions = [] | |
| try: | |
| # Check each breed alias | |
| for alias, standard_breed in self.breed_aliases.items(): | |
| if alias in text: | |
| # Find surrounding preference indicators | |
| preference_score = self._find_preference_score(text, alias) | |
| if isinstance(standard_breed, list): | |
| # If alias maps to multiple breeds, add all | |
| for breed in standard_breed: | |
| breed_mentions.append((breed, preference_score)) | |
| else: | |
| breed_mentions.append((standard_breed, preference_score)) | |
| # Deduplicate and merge scores | |
| breed_scores = {} | |
| for breed, score in breed_mentions: | |
| if breed in breed_scores: | |
| breed_scores[breed] = max(breed_scores[breed], score) | |
| else: | |
| breed_scores[breed] = score | |
| return list(breed_scores.items()) | |
| except Exception as e: | |
| print(f"Error extracting breed mentions: {str(e)}") | |
| return [] | |
| def _find_preference_score(self, text: str, breed_mention: str) -> float: | |
| """ | |
| Find preference score near breed mention | |
| Args: | |
| text: Text | |
| breed_mention: Breed mention | |
| Returns: | |
| Preference score (0.0-1.0) | |
| """ | |
| try: | |
| # Find breed mention position | |
| mention_pos = text.find(breed_mention) | |
| if mention_pos == -1: | |
| return 0.5 # Default neutral score | |
| # Check context (50 characters before and after) | |
| context_start = max(0, mention_pos - 50) | |
| context_end = min(len(text), mention_pos + len(breed_mention) + 50) | |
| context = text[context_start:context_end] | |
| # Find preference indicators | |
| max_score = 0.5 # Default score | |
| for indicator, score in self.preference_indicators.items(): | |
| if indicator in context: | |
| max_score = max(max_score, score) | |
| # Find order keywords | |
| for order_word, multiplier in self.order_keywords.items(): | |
| if order_word in context: | |
| max_score = max(max_score, max_score * multiplier) | |
| return max_score | |
| except Exception as e: | |
| print(f"Error finding preference score: {str(e)}") | |
| return 0.5 | |
| def extract_lifestyle_preferences(self, text: str) -> Dict[str, Dict[str, float]]: | |
| """ | |
| Extract lifestyle preferences from text | |
| Args: | |
| text: Input text | |
| Returns: | |
| Lifestyle preferences dictionary | |
| """ | |
| text = self.preprocess_text(text) | |
| preferences = {} | |
| try: | |
| for category, keywords_dict in self.lifestyle_keywords.items(): | |
| preferences[category] = {} | |
| for preference_type, keywords in keywords_dict.items(): | |
| score = 0.0 | |
| count = 0 | |
| for keyword in keywords: | |
| if keyword in text: | |
| # Calculate keyword occurrence intensity | |
| keyword_count = text.count(keyword) | |
| score += keyword_count | |
| count += keyword_count | |
| if count > 0: | |
| # Normalize score | |
| preferences[category][preference_type] = min(score / max(count, 1), 1.0) | |
| return preferences | |
| except Exception as e: | |
| print(f"Error extracting lifestyle preferences: {str(e)}") | |
| return {} | |
| def generate_search_keywords(self, text: str) -> List[str]: | |
| """ | |
| Generate keyword list for search | |
| Args: | |
| text: Input text | |
| Returns: | |
| List of keywords | |
| """ | |
| text = self.preprocess_text(text) | |
| keywords = [] | |
| try: | |
| # Tokenize and filter stop words | |
| words = text.split() | |
| for word in words: | |
| if len(word) > 2 and word not in self.stop_words: | |
| keywords.append(word) | |
| # Extract important phrases | |
| phrases = self._extract_phrases(text) | |
| keywords.extend(phrases) | |
| # Remove duplicates | |
| keywords = list(set(keywords)) | |
| return keywords | |
| except Exception as e: | |
| print(f"Error generating search keywords: {str(e)}") | |
| return [] | |
| def _extract_phrases(self, text: str) -> List[str]: | |
| """ | |
| Extract important phrases | |
| Args: | |
| text: Input text | |
| Returns: | |
| List of phrases | |
| """ | |
| phrases = [] | |
| # Define important phrase patterns | |
| phrase_patterns = [ | |
| r'good with \w+', | |
| r'apartment \w+', | |
| r'family \w+', | |
| r'exercise \w+', | |
| r'grooming \w+', | |
| r'noise \w+', | |
| r'training \w+', | |
| r'health \w+', | |
| r'\w+ friendly', | |
| r'\w+ tolerant', | |
| r'\w+ maintenance', | |
| r'\w+ energy', | |
| r'\w+ barking', | |
| r'\w+ shedding' | |
| ] | |
| for pattern in phrase_patterns: | |
| matches = re.findall(pattern, text) | |
| phrases.extend(matches) | |
| return phrases | |
| def analyze_sentiment(self, text: str) -> Dict[str, float]: | |
| """ | |
| Analyze text sentiment | |
| Args: | |
| text: Input text | |
| Returns: | |
| Sentiment analysis results {'positive': 0.0-1.0, 'negative': 0.0-1.0, 'neutral': 0.0-1.0} | |
| """ | |
| text = self.preprocess_text(text) | |
| positive_words = [ | |
| 'love', 'like', 'want', 'prefer', 'good', 'great', 'excellent', | |
| 'perfect', 'ideal', 'wonderful', 'amazing', 'fantastic' | |
| ] | |
| negative_words = [ | |
| 'hate', 'dislike', 'bad', 'terrible', 'awful', 'horrible', | |
| 'not good', 'don\'t want', 'avoid', 'against', 'problem' | |
| ] | |
| positive_count = sum(1 for word in positive_words if word in text) | |
| negative_count = sum(1 for word in negative_words if word in text) | |
| total_words = len(text.split()) | |
| if total_words == 0: | |
| return {'positive': 0.0, 'negative': 0.0, 'neutral': 1.0} | |
| positive_ratio = positive_count / total_words | |
| negative_ratio = negative_count / total_words | |
| neutral_ratio = 1.0 - positive_ratio - negative_ratio | |
| return { | |
| 'positive': positive_ratio, | |
| 'negative': negative_ratio, | |
| 'neutral': max(0.0, neutral_ratio) | |
| } | |
| def extract_implicit_preferences(self, text: str) -> Dict[str, Any]: | |
| """ | |
| Extract implicit preferences from text | |
| Args: | |
| text: Input text | |
| Returns: | |
| Dictionary of implicit preferences | |
| """ | |
| text = self.preprocess_text(text) | |
| implicit_prefs = {} | |
| try: | |
| # Infer preferences from mentioned activities | |
| if any(activity in text for activity in ['hiking', 'running', 'jogging', 'outdoor']): | |
| implicit_prefs['exercise_needs'] = 'high' | |
| implicit_prefs['size_preference'] = 'medium_to_large' | |
| # Infer from living environment | |
| if any(env in text for env in ['apartment', 'small space', 'city']): | |
| implicit_prefs['size_preference'] = 'small_to_medium' | |
| implicit_prefs['noise_tolerance'] = 'low' | |
| implicit_prefs['exercise_needs'] = 'moderate' | |
| # Infer from family situation | |
| if 'children' in text or 'kids' in text: | |
| implicit_prefs['temperament'] = 'gentle_patient' | |
| implicit_prefs['good_with_children'] = True | |
| # Infer from experience level | |
| if any(exp in text for exp in ['first time', 'beginner', 'new to']): | |
| implicit_prefs['care_level'] = 'low_to_moderate' | |
| implicit_prefs['training_difficulty'] = 'easy' | |
| # Infer from time commitment | |
| if any(time in text for time in ['busy', 'no time', 'low maintenance']): | |
| implicit_prefs['grooming_needs'] = 'low' | |
| implicit_prefs['care_level'] = 'low' | |
| implicit_prefs['exercise_needs'] = 'low_to_moderate' | |
| return implicit_prefs | |
| except Exception as e: | |
| print(f"Error extracting implicit preferences: {str(e)}") | |
| return {} | |
| def validate_input(self, text: str) -> Dict[str, Any]: | |
| """ | |
| Validate input text validity | |
| Args: | |
| text: Input text | |
| Returns: | |
| Validation results dictionary | |
| """ | |
| if not text or not text.strip(): | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Empty input', | |
| 'suggestions': ['Please provide a description of your preferences'] | |
| } | |
| text = text.strip() | |
| # Check length | |
| if len(text) < 10: | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Input too short', | |
| 'suggestions': ['Please provide more details about your preferences'] | |
| } | |
| if len(text) > 1000: | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Input too long', | |
| 'suggestions': ['Please provide a more concise description'] | |
| } | |
| # Check for meaningful content | |
| processed_text = self.preprocess_text(text) | |
| meaningful_words = [word for word in processed_text.split() | |
| if len(word) > 2 and word not in self.stop_words] | |
| if len(meaningful_words) < 3: | |
| return { | |
| 'is_valid': False, | |
| 'error': 'Not enough meaningful content', | |
| 'suggestions': ['Please provide more specific details about your lifestyle and preferences'] | |
| } | |
| return { | |
| 'is_valid': True, | |
| 'word_count': len(meaningful_words), | |
| 'suggestions': [] | |
| } | |
def get_nlp_processor():
    """Return a NaturalLanguageProcessor instance, or None on failure."""
    try:
        processor = NaturalLanguageProcessor()
    except Exception as e:
        print(f"Error creating NLP processor: {str(e)}")
        return None
    return processor