File size: 18,323 Bytes
f7d42c1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 |
"""
Query Expansion System for CogniChat RAG Application
This module implements advanced query expansion techniques to improve retrieval quality:
- QueryAnalyzer: Extracts intent, entities, and keywords
- QueryRephraser: Generates natural language variations
- MultiQueryExpander: Creates diverse query formulations
- MultiHopReasoner: Connects concepts across documents
- FallbackStrategies: Handles edge cases gracefully
Author: CogniChat Team
Date: October 19, 2025
"""
import re
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from enum import Enum
class QueryStrategy(Enum):
    """Selects how aggressively a query is expanded.

    Each member's value is the lowercase form of its name, so a member can be
    looked up from a plain string, e.g. ``QueryStrategy("quick")``.
    """

    # About 2 queries: fastest option, minimal expansion.
    QUICK = "quick"
    # About 3-4 queries: compromise between speed and coverage.
    BALANCED = "balanced"
    # About 5-6 queries: widest coverage.
    COMPREHENSIVE = "comprehensive"
@dataclass
class QueryAnalysis:
    """Structured outcome of analysing one query."""

    # Intent label such as question, definition, comparison or explanation.
    intent: str
    # Named entities pulled out of the query text.
    entities: List[str]
    # Important keywords found in the query.
    keywords: List[str]
    # Complexity bucket: simple, medium or complex.
    complexity: str
    # Technical domain when one was detected, otherwise None.
    domain: Optional[str] = None
@dataclass
class ExpandedQuery:
    """Bundle holding a query together with its generated variations."""

    # The query exactly as it was supplied.
    original: str
    # Generated query formulations.
    variations: List[str]
    # Strategy that was used to produce the variations.
    strategy_used: QueryStrategy
    # Analysis results the variations were derived from.
    analysis: QueryAnalysis
class QueryAnalyzer:
    """
    Analyzes queries to extract intent, entities, and key information.

    All analysis in this class is rule-based (regular expressions and word
    lists). The optional ``llm`` handle is only stored on the instance; no
    method of this class calls it.
    """

    # Question words that are capitalised only because they open a sentence.
    _QUESTION_WORDS = frozenset({'what', 'how', 'why', 'when', 'where', 'which'})

    # Punctuation trimmed from both ends of a token before it is kept as an
    # entity, so "Java," and "Python?" become "Java" and "Python".
    _EDGE_PUNCTUATION = '.,;:!?"\'()[]{}'

    # Words ignored when collecting keywords (simplified list).
    _STOP_WORDS = frozenset({
        'a', 'an', 'the', 'is', 'are', 'was', 'were', 'be', 'been',
        'what', 'how', 'why', 'when', 'where', 'which', 'who',
        'do', 'does', 'did', 'can', 'could', 'should', 'would',
        'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    })

    # Domain -> trigger terms. Domains are tested in this order and the first
    # one with a matching term wins.
    _DOMAIN_KEYWORDS = {
        'programming': ['code', 'function', 'class', 'variable', 'algorithm', 'debug'],
        'data_science': ['model', 'dataset', 'training', 'prediction', 'accuracy'],
        'machine_learning': ['neural', 'network', 'learning', 'ai', 'deep learning'],
        'web': ['html', 'css', 'javascript', 'api', 'frontend', 'backend'],
        'database': ['sql', 'query', 'database', 'table', 'index'],
        'security': ['encryption', 'authentication', 'vulnerability', 'attack'],
    }

    def __init__(self, llm=None):
        """
        Initialize QueryAnalyzer.

        Args:
            llm: Optional LangChain LLM; stored as ``self.llm`` but not used
                by the rule-based methods below.
        """
        self.llm = llm
        # Intent label -> regex run against the lowercased query. Patterns are
        # tried in insertion order, so earlier intents take precedence.
        self.intent_patterns = {
            'definition': r'\b(what is|define|meaning of|definition)\b',
            'how_to': r'\b(how to|how do|how can|steps to)\b',
            'comparison': r'\b(compare|difference|versus|vs|better than)\b',
            'explanation': r'\b(why|explain|reason|cause)\b',
            'listing': r'\b(list|enumerate|what are|types of)\b',
            'example': r'\b(example|instance|sample|case)\b',
        }

    def analyze(self, query: str) -> QueryAnalysis:
        """
        Analyze query to extract intent, entities, and keywords.

        Args:
            query: User's original query

        Returns:
            QueryAnalysis object with extracted information
        """
        query_lower = query.lower()
        # Entities and keywords are needed first: complexity is scored from them.
        entities = self._extract_entities(query)
        keywords = self._extract_keywords(query)
        return QueryAnalysis(
            intent=self._detect_intent(query_lower),
            entities=entities,
            keywords=keywords,
            complexity=self._assess_complexity(query, entities, keywords),
            domain=self._detect_domain(query_lower),
        )

    def _detect_intent(self, query_lower: str) -> str:
        """Return the first intent whose pattern matches, or 'general'."""
        for intent, pattern in self.intent_patterns.items():
            if re.search(pattern, query_lower):
                return intent
        return 'general'

    def _extract_entities(self, query: str) -> List[str]:
        """
        Extract candidate named entities (simplified heuristic).

        A token counts as an entity when, after trimming surrounding
        punctuation, it starts with an uppercase letter and is not a leading
        question word. Double-quoted phrases are added as entities as well.

        Returns:
            Entities in order of first appearance, without duplicates. The
            order is deterministic so that callers slicing the list get
            stable results between runs.
        """
        entities = []
        for token in query.split():
            word = token.strip(self._EDGE_PUNCTUATION)
            # A token made only of punctuation is empty after trimming.
            if word and word[0].isupper() and word.lower() not in self._QUESTION_WORDS:
                entities.append(word)
        # Quoted phrases are kept whole, whatever their capitalisation.
        for phrase in re.findall(r'"([^"]+)"', query):
            phrase = phrase.strip()
            if phrase:
                entities.append(phrase)
        # dict.fromkeys de-duplicates while preserving insertion order
        # (a set would not keep a stable order for strings).
        return list(dict.fromkeys(entities))

    def _extract_keywords(self, query: str) -> List[str]:
        """Return up to 10 lowercased non-stop-words longer than two characters."""
        words = re.findall(r'\b\w+\b', query.lower())
        keywords = [w for w in words if w not in self._STOP_WORDS and len(w) > 2]
        return keywords[:10]  # Limit to the first 10

    def _assess_complexity(self, query: str, entities: List[str], keywords: List[str]) -> str:
        """
        Bucket the query as 'simple', 'medium' or 'complex'.

        The score is word count + 2 per entity + 1.5 per keyword; scores below
        15 are simple, below 30 medium, anything else complex.
        """
        score = len(query.split()) + (len(entities) * 2) + (len(keywords) * 1.5)
        if score < 15:
            return 'simple'
        if score < 30:
            return 'medium'
        return 'complex'

    def _detect_domain(self, query_lower: str) -> Optional[str]:
        """
        Detect the technical domain, if any.

        Terms are matched as whole words (an optional plural ``s``/``es`` is
        allowed) rather than as raw substrings, so that 'ai' does not fire on
        "explain" and 'table' does not fire on "stable".

        Args:
            query_lower: The query, already lowercased.

        Returns:
            The first matching domain name, or None when nothing matches.
        """
        for domain, terms in self._DOMAIN_KEYWORDS.items():
            alternation = '|'.join(re.escape(term) for term in terms)
            if re.search(rf'\b(?:{alternation})(?:s|es)?\b', query_lower):
                return domain
        return None
class QueryRephraser:
    """
    Generates natural language variations of queries using multiple strategies.

    Every variation is built with string templates and a small synonym table;
    the optional ``llm`` handle is stored but not called by this class.
    """

    # Single-word replacements used by _synonym_variation.
    _SYNONYMS = {
        'error': 'issue',
        'problem': 'issue',
        'fix': 'resolve',
        'use': 'utilize',
        'create': 'generate',
        'make': 'create',
        'get': 'retrieve',
        'show': 'display',
        'find': 'locate',
        'explain': 'describe',
    }
    # Whole-word, case-insensitive matcher for any key of _SYNONYMS. Matching
    # on word boundaries lets "error?" and "Fix" be recognised too.
    _SYNONYM_PATTERN = re.compile(r'\b(' + '|'.join(_SYNONYMS) + r')\b', re.IGNORECASE)

    def __init__(self, llm=None):
        """
        Initialize QueryRephraser.

        Args:
            llm: Optional LangChain LLM; stored as ``self.llm`` only.
        """
        self.llm = llm

    def generate_variations(
        self,
        query: str,
        analysis: QueryAnalysis,
        strategy: QueryStrategy = QueryStrategy.BALANCED
    ) -> List[str]:
        """
        Generate query variations based on strategy.

        Args:
            query: Original query
            analysis: Query analysis results
            strategy: Expansion strategy to use

        Returns:
            De-duplicated variations in generation order; the original query
            is always the first element.
        """
        # Which builders run for each strategy, in output order.
        plans = {
            QueryStrategy.QUICK: (
                self._synonym_variation,
            ),
            QueryStrategy.BALANCED: (
                self._synonym_variation,
                self._expanded_variation,
                self._simplified_variation,
            ),
            QueryStrategy.COMPREHENSIVE: (
                self._synonym_variation,
                self._expanded_variation,
                self._simplified_variation,
                self._keyword_focused,
                self._context_variation,
            ),
        }

        variations = [query]  # Always include original
        for build in plans.get(strategy, ()):
            variations.append(build(query, analysis))

        # Extra alternate phrasing for procedural / explanatory questions.
        # Skipped when there are no keywords, which would yield "Guide to ".
        if (
            strategy == QueryStrategy.COMPREHENSIVE
            and analysis.intent in ('how_to', 'explanation')
            and analysis.keywords
        ):
            variations.append(f"Guide to {' '.join(analysis.keywords[:3])}")

        # Drop empty/blank results, then remove duplicates preserving order.
        variations = [v for v in variations if v and v.strip()]
        return list(dict.fromkeys(variations))

    def _synonym_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """
        Replace the first word that has a known synonym.

        Only one word is replaced to keep the phrasing natural. The rest of
        the query keeps its original casing and punctuation, and a capitalised
        match gets a capitalised replacement. When no word has a synonym the
        query is returned unchanged, so the caller's de-duplication drops it.
        """
        def _swap(match):
            found = match.group(0)
            replacement = self._SYNONYMS[found.lower()]
            return replacement.capitalize() if found[0].isupper() else replacement

        return self._SYNONYM_PATTERN.sub(_swap, query, count=1)

    def _expanded_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Generate a longer variation whose wording depends on the intent."""
        # A definition request without keywords has nothing to name, so it
        # falls through to the generic template instead of ending in "of ".
        if analysis.intent == 'definition' and analysis.keywords:
            return f"Detailed explanation and definition of {' '.join(analysis.keywords)}"
        if analysis.intent == 'how_to':
            return f"Step-by-step guide on {query.lower()}"
        if analysis.intent == 'comparison':
            return f"Comprehensive comparison: {query}"
        # Generic fallback: add qualifying words
        return f"Detailed information about {query.lower()}"

    def _simplified_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Return the first three keywords, or the query if fewer than two exist."""
        if len(analysis.keywords) >= 2:
            return ' '.join(analysis.keywords[:3])
        return query

    def _keyword_focused(self, query: str, analysis: QueryAnalysis) -> str:
        """
        Create keyword-focused variation for BM25.

        Joins up to five terms taken from the keywords followed by the
        entities. A term that appears in both lists (ignoring case) is kept
        once, so the result is not padded with repeats such as "python Python".
        """
        seen = set()
        terms = []
        for term in analysis.keywords + analysis.entities:
            key = term.casefold()
            if key not in seen:
                seen.add(key)
                terms.append(term)
        return ' '.join(terms[:5])

    def _context_variation(self, query: str, analysis: QueryAnalysis) -> str:
        """Append the detected domain as context; return the query as-is otherwise."""
        if analysis.domain:
            return f"{query} in {analysis.domain} context"
        return query
class MultiQueryExpander:
    """
    Main query expansion orchestrator that combines analysis and rephrasing.
    """

    def __init__(self, llm=None):
        """
        Initialize MultiQueryExpander.

        Args:
            llm: Optional LangChain LLM, forwarded to the analyzer and the
                rephraser that this object creates.
        """
        self.analyzer = QueryAnalyzer(llm)
        self.rephraser = QueryRephraser(llm)

    def expand(
        self,
        query: str,
        strategy: QueryStrategy = QueryStrategy.BALANCED,
        max_queries: int = 6
    ) -> ExpandedQuery:
        """
        Expand query into multiple variations.

        Args:
            query: Original user query
            strategy: Expansion strategy
            max_queries: Maximum number of queries to return. Values below 1
                are treated as 1 so the result always contains at least the
                first variation.

        Returns:
            ExpandedQuery object with the (possibly truncated) variations
        """
        analysis = self.analyzer.analyze(query)
        variations = self.rephraser.generate_variations(query, analysis, strategy)

        # A plain variations[:max_queries] misbehaves for non-positive limits:
        # a negative value slices from the end and zero returns nothing.
        limit = max(1, max_queries)

        return ExpandedQuery(
            original=query,
            variations=variations[:limit],
            strategy_used=strategy,
            analysis=analysis,
        )
class MultiHopReasoner:
    """
    Implements multi-hop reasoning to connect concepts across documents.
    Useful for complex queries that require information from multiple sources.

    Sub-queries are produced from templates driven by the query analysis; the
    optional ``llm`` handle is stored but not called by this class.
    """

    # Upper bound on the number of sub-queries returned (original included).
    _MAX_SUB_QUERIES = 5

    # Comparison cue words and connectives. They describe the comparison
    # itself and must not be mistaken for one of the things being compared.
    _COMPARISON_NOISE = frozenset({
        'compare', 'compared', 'comparing', 'comparison',
        'difference', 'differences', 'between',
        'versus', 'vs', 'better', 'than', 'and', 'or',
    })

    # Punctuation ignored when checking a term against _COMPARISON_NOISE.
    _EDGE_PUNCTUATION = '.,;:!?"\'()[]{}'

    def __init__(self, llm=None):
        """
        Initialize MultiHopReasoner.

        Args:
            llm: Optional LangChain LLM; stored as ``self.llm`` only.
        """
        self.llm = llm

    def _comparison_subjects(self, analysis: QueryAnalysis) -> List[str]:
        """Pick the two compared subjects: entities first, keywords as fallback."""
        for pool in (analysis.entities, analysis.keywords):
            subjects = [
                term for term in pool
                if term.strip(self._EDGE_PUNCTUATION).lower() not in self._COMPARISON_NOISE
            ]
            if len(subjects) >= 2:
                return subjects[:2]
        return []

    def generate_sub_queries(self, query: str, analysis: QueryAnalysis) -> List[str]:
        """
        Break complex query into sub-queries for multi-hop reasoning.

        Args:
            query: Original complex query
            analysis: Query analysis

        Returns:
            Up to five distinct sub-queries; the original query comes first.
        """
        sub_queries = [query]

        # Comparison: one lookup per compared subject. Cue words such as
        # "Compare" are filtered out so they are not treated as subjects.
        if analysis.intent == 'comparison':
            for subject in self._comparison_subjects(analysis):
                sub_queries.append(f"Information about {subject}")

        # How-to: ask separately for prerequisites and for the steps.
        if analysis.intent == 'how_to' and len(analysis.keywords) >= 2:
            main_topic = ' '.join(analysis.keywords[:2])
            sub_queries.append(f"Prerequisites for {main_topic}")
            sub_queries.append(f"Steps to {main_topic}")

        # Complex questions: split the keywords into two focused halves.
        if analysis.complexity == 'complex' and len(analysis.keywords) > 3:
            mid = len(analysis.keywords) // 2
            sub_queries.append(' '.join(analysis.keywords[:mid]))
            sub_queries.append(' '.join(analysis.keywords[mid:]))

        # De-duplicate (order preserved) before applying the cap, so repeated
        # entries do not use up slots.
        unique = list(dict.fromkeys(sub_queries))
        return unique[:self._MAX_SUB_QUERIES]
class FallbackStrategies:
    """
    Implements fallback strategies for queries that don't retrieve good results.

    All methods are static; the class is only a namespace.
    """

    @staticmethod
    def simplify_query(query: str) -> str:
        """
        Simplify query by removing modifiers and focusing on core terms.

        Question/modal words and common articles or forms of "to be" are
        removed (case-insensitively) and whitespace is collapsed.

        Returns:
            The simplified query. If removing those words leaves nothing, the
            stripped original is returned instead of an empty string.
        """
        # Remove question words
        simplified = re.sub(
            r'\b(what|how|why|when|where|which|who|can|could|should|would)\b',
            '', query, flags=re.IGNORECASE,
        )
        # Remove common filler words
        simplified = re.sub(
            r'\b(is|are|was|were|be|been|the|a|an)\b',
            '', simplified, flags=re.IGNORECASE,
        )
        # Clean up extra spaces
        simplified = re.sub(r'\s+', ' ', simplified).strip()
        return simplified or query.strip()

    @staticmethod
    def broaden_query(query: str, analysis: QueryAnalysis) -> str:
        """
        Broaden query to increase recall.

        Args:
            query: The query to broaden.
            analysis: Analysis whose ``keywords`` supply the topic term.

        Returns:
            "<keyword> overview" for the first keyword that is not itself a
            narrowing word (specific, exactly, precisely, only, just). When no
            such keyword exists, the query with those narrowing words removed
            and whitespace collapsed; if that would be empty, the stripped
            original query.
        """
        constraint_pattern = r'\b(specific|exactly|precisely|only|just)\b'

        # The narrowing words can also appear in analysis.keywords; skip them
        # so the result is never something like "specific overview".
        for keyword in analysis.keywords:
            if not re.fullmatch(constraint_pattern, keyword, flags=re.IGNORECASE):
                return f"{keyword} overview"

        # No usable keyword: drop the narrowing words from the query itself.
        relaxed = re.sub(constraint_pattern, '', query, flags=re.IGNORECASE)
        relaxed = re.sub(r'\s+', ' ', relaxed).strip()
        return relaxed or query.strip()

    @staticmethod
    def focus_entities(analysis: QueryAnalysis) -> str:
        """
        Create entity-focused query as fallback.

        Returns:
            All entities joined by spaces; otherwise the first three keywords;
            otherwise an empty string.
        """
        if analysis.entities:
            return ' '.join(analysis.entities)
        if analysis.keywords:
            return ' '.join(analysis.keywords[:3])
        return ""
# Convenience function for easy integration
def expand_query_simple(
    query: str,
    strategy: str = "balanced",
    llm=None
) -> List[str]:
    """
    Simple function to expand a query without dealing with classes.

    Args:
        query: User's query to expand
        strategy: "quick", "balanced", or "comprehensive". Matching ignores
            case and surrounding whitespace; a QueryStrategy member is
            accepted as well.
        llm: Optional LangChain LLM

    Returns:
        List of expanded query variations; the original query is first.

    Raises:
        ValueError: If ``strategy`` does not name a known strategy.

    Example:
        >>> queries = expand_query_simple("What is recursion?", strategy="quick")
        >>> queries[0]
        'What is recursion?'
    """
    if isinstance(strategy, QueryStrategy):
        strategy_enum = strategy
    else:
        # Enum lookup by value is exact, so normalise the name first;
        # otherwise "Balanced" or "quick " would be rejected.
        normalized = str(strategy).strip().lower()
        try:
            strategy_enum = QueryStrategy(normalized)
        except ValueError:
            valid = ", ".join(repr(member.value) for member in QueryStrategy)
            raise ValueError(
                f"Unknown strategy {strategy!r}; expected one of: {valid}"
            ) from None

    expander = MultiQueryExpander(llm=llm)
    expanded = expander.expand(query, strategy=strategy_enum)
    return expanded.variations
# Example usage and testing
if __name__ == "__main__":
# Example 1: Simple query expansion
print("=" * 60)
print("Example 1: Simple Query Expansion")
print("=" * 60)
query = "What is machine learning?"
queries = expand_query_simple(query, strategy="balanced")
print(f"\nOriginal: {query}")
print(f"\nExpanded queries ({len(queries)}):")
for i, q in enumerate(queries, 1):
print(f" {i}. {q}")
# Example 2: Complex query with full analysis
print("\n" + "=" * 60)
print("Example 2: Complex Query with Analysis")
print("=" * 60)
expander = MultiQueryExpander()
query = "How do I compare the performance of different neural network architectures?"
result = expander.expand(query, strategy=QueryStrategy.COMPREHENSIVE)
print(f"\nOriginal: {result.original}")
print(f"\nAnalysis:")
print(f" Intent: {result.analysis.intent}")
print(f" Entities: {result.analysis.entities}")
print(f" Keywords: {result.analysis.keywords}")
print(f" Complexity: {result.analysis.complexity}")
print(f" Domain: {result.analysis.domain}")
print(f"\nExpanded queries ({len(result.variations)}):")
for i, q in enumerate(result.variations, 1):
print(f" {i}. {q}")
# Example 3: Multi-hop reasoning
print("\n" + "=" * 60)
print("Example 3: Multi-Hop Reasoning")
print("=" * 60)
reasoner = MultiHopReasoner()
analyzer = QueryAnalyzer()
query = "Compare Python and Java for web development"
analysis = analyzer.analyze(query)
sub_queries = reasoner.generate_sub_queries(query, analysis)
print(f"\nOriginal: {query}")
print(f"\nSub-queries for multi-hop reasoning:")
for i, sq in enumerate(sub_queries, 1):
print(f" {i}. {sq}")
# Example 4: Fallback strategies
print("\n" + "=" * 60)
print("Example 4: Fallback Strategies")
print("=" * 60)
query = "What is the specific difference between supervised and unsupervised learning?"
analysis = analyzer.analyze(query)
|