Commit 21446aa
Init commit
Files changed:
- .DS_Store +0 -0
- .dockerignore +4 -0
- .gitattributes +35 -0
- .gitignore +2 -0
- .huggingface.yml +4 -0
- Dockerfile +45 -0
- README.md +116 -0
- api/README.md +85 -0
- api/__init__.py +2 -0
- api/app.py +49 -0
- api/chatbot.py +202 -0
- api/config.py +67 -0
- api/retrieval.py +155 -0
- api/routes.py +434 -0
- main.py +13 -0
- memory/__init__.py +2 -0
- memory/memory.py +331 -0
- models/__init__.py +3 -0
- models/download_model.py +51 -0
- models/llama.py +125 -0
- models/summarizer.py +216 -0
- models/warmup.py +8 -0
- requirements.txt +25 -0
- search/.DS_Store +0 -0
- search/__init__.py +26 -0
- search/coordinator.py +504 -0
- search/engines/__init__.py +6 -0
- search/engines/cooking.py +197 -0
- search/engines/duckduckgo.py +599 -0
- search/engines/multilingual.py +272 -0
- search/engines/video.py +432 -0
- search/extractors/__init__.py +3 -0
- search/extractors/content.py +211 -0
- search/processors/__init__.py +6 -0
- search/processors/cooking.py +258 -0
- search/processors/enhanced.py +331 -0
- search/processors/language.py +266 -0
- search/processors/sources.py +352 -0
- search/search.py +362 -0
- utils/__init__.py +4 -0
- utils/migrate.py +54 -0
- utils/symbipredict_2022.csv +0 -0
- utils/translation.py +141 -0
- utils/vlm.py +54 -0
.DS_Store
ADDED
Binary file (6.15 kB)
.dockerignore
ADDED
@@ -0,0 +1,4 @@
api/legacy.py
*.md
.env
*yml
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,2 @@
.env
secrets.toml
.huggingface.yml
ADDED
@@ -0,0 +1,4 @@
sdk: docker
app_file: app.py
port: 7860
hardware: cpu-basic
Dockerfile
ADDED
@@ -0,0 +1,45 @@
FROM python:3.11

# Create and use a non-root user (optional)
RUN useradd -m -u 1000 user
USER user
ENV PATH="/home/user/.local/bin:$PATH"

# Set working directory
WORKDIR /app

# Copy all project files to the container
COPY . .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Verify CSV file is present and accessible
RUN ls -la /app/utils/symbipredict_2022.csv || echo "CSV file not found"

# Test CSV loading in Docker environment
RUN python /app/test_docker_csv.py

# Clean up test file
RUN rm /app/test_docker_csv.py

# Set Hugging Face cache directory to persist model downloads
ENV HF_HOME="/home/user/.cache/huggingface"
ENV SENTENCE_TRANSFORMERS_HOME="/home/user/.cache/huggingface/sentence-transformers"
ENV MEDGEMMA_HOME="/home/user/.cache/huggingface/sentence-transformers"

# Create cache directories and ensure permissions
RUN mkdir -p /app/model_cache /home/user/.cache/huggingface/sentence-transformers && \
    chown -R user:user /app/model_cache /home/user/.cache/huggingface

# Pre-load model in a separate script
RUN python /app/models/download_model.py && python /app/models/warmup.py

# Ensure ownership and permissions remain intact
RUN chown -R user:user /app/model_cache

# Expose port
EXPOSE 7860

# Run the application using main.py as entry point
CMD ["python", "main.py"]
README.md
ADDED
@@ -0,0 +1,116 @@
---
title: Cooking Tutor API
emoji: 👨🍳
colorFrom: orange
colorTo: red
sdk: docker
sdk_version: latest
pinned: false
license: apache-2.0
short_description: Cooking Tutor with WebSearch, Memory, Multilingual
---

# Cooking Tutor Backend

## At-a-glance
Production-grade cooking assistant with web search integration, conversation memory, multilingual support, and comprehensive recipe guidance.

## Key Features

### 🔍 Web Search Integration
- Curated cooking sources (AllRecipes, Food Network, Epicurious, etc.)
- Content extraction and summarization
- Citation mapping with clickable URLs
- Cooking relevance filtering

### 🧠 Memory & Retrieval
- Conversation memory with FAISS indexing
- Semantic chunking and summarization
- Context builder for conversation continuity
- Up to 20 recent summaries per user

### 🌍 Multilingual Support
- Vietnamese and Chinese translation
- Language detection and query enhancement
- Fallback handling for translation failures

### 🍳 Cooking Focus
- Specialized cooking keyword filtering
- Recipe and technique guidance
- Ingredient substitution suggestions
- Cooking time and temperature guidance

## Usage

### Running the Application
```bash
# Using main entry point
python main.py

# Or directly
python api/app.py
```

### Environment Variables
- `FlashAPI` - Gemini API key (required)
- `NVIDIA_URI` - Optional for advanced features
- `NVIDIA_RERANK_ENDPOINT` - Optional reranker endpoint

## API Endpoints

### POST `/chat`
Main chat endpoint with cooking guidance.

**Request Body:**
```json
{
  "query": "How to make perfect pasta?",
  "lang": "EN",
  "search": true,
  "user_id": "unique_user_id",
  "servings": 4,
  "dietary": ["vegetarian"],
  "skill_level": "beginner",
  "structured": true
}
```

**Response:**
```json
{
  "response": "Cooking guidance with citations <URL>",
  "response_time": "2.34s"
}
```

## Search Mode Features

When `search: true`:
1. Search curated cooking sources
2. Extract and summarize relevant content
3. Filter by cooking relevance
4. Provide citations with clickable URLs

## Memory Features

- **Conversation Continuity**: Maintains context across sessions
- **Semantic Chunking**: Groups related cooking topics
- **Usage Tracking**: Prioritizes frequently used information
- **Time Decay**: Recent conversations get higher priority

## Folders Overview
- `api/` - FastAPI app, routes, chatbot orchestration
- `models/` - Summarizer and processing models
- `memory/` - Memory manager and FAISS interfaces
- `search/` - Web search engines and processors
- `utils/` - Translation and utility functions

## Dependencies

See `requirements.txt` for complete list. Key components:
- `google-genai` - Gemini API integration
- `faiss-cpu` - Vector similarity search
- `sentence-transformers` - Text embeddings
- `transformers` - Translation models
- `requests` - Web search functionality
- `beautifulsoup4` - HTML content extraction
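Editor's note: as a minimal sketch of the `/chat` contract documented in this README, a local client could look like the following. The base URL assumes a local run on port 7860 (the port configured in `main.py`); the payload mirrors the documented request body, and the answer is read from the `response` key that `routes.py` returns.

```python
# Minimal /chat client sketch (assumes the API is running locally on port 7860).
import requests

payload = {
    "query": "How to make perfect pasta?",
    "lang": "EN",
    "search": True,
    "user_id": "unique_user_id",
    "servings": 4,
    "dietary": ["vegetarian"],
    "skill_level": "beginner",
    "structured": True,
}

resp = requests.post("http://localhost:7860/chat", json=payload, timeout=120)
resp.raise_for_status()

data = resp.json()
# routes.py returns the answer (with the elapsed time appended) under "response",
# plus an optional "videos" list when video mode is enabled.
print(data["response"])
for video in data.get("videos", []):
    print(video)
```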
api/README.md
ADDED
@@ -0,0 +1,85 @@
# API Module Structure - Cooking Tutor

## 📁 **Module Overview**

### **config.py** - Configuration Management
- Environment variables validation
- Logging configuration
- System resource monitoring
- Memory optimization settings
- CORS configuration

### **retrieval.py** - Web Search Integration
- Cooking information retrieval via web search
- Recipe suggestion system
- Smart content filtering and relevance scoring
- Web search result processing

### **chatbot.py** - Core Chatbot Logic
- CookingTutorChatbot class
- Gemini API client
- Web search integration
- Citation processing
- Memory management integration

### **routes.py** - API Endpoints
- `/chat` - Main chat endpoint
- `/health` - Health check
- `/` - Root endpoint with landing page
- Request/response handling

### **app.py** - Main Application
- FastAPI app initialization
- Middleware configuration
- Route registration
- Server startup

## 🔄 **Data Flow**

```
Request → routes.py → chatbot.py → search.py (web search)
                          ↓
             memory.py (context) + utils/ (translation)
                          ↓
             models/ (summarization processing)
                          ↓
             Response with citations
```

## 🚀 **Benefits of Modular Structure**

1. **Separation of Concerns**: Each module has a single responsibility
2. **Easier Testing**: Individual modules can be tested in isolation
3. **Better Maintainability**: Changes to one module don't affect others
4. **Improved Readability**: Smaller files are easier to understand
5. **Reusability**: Modules can be imported and used elsewhere
6. **Scalability**: Easy to add new features without affecting existing code

## 📊 **File Sizes**

| File | Lines | Purpose |
|------|-------|---------|
| **app.py** | 50 | Main app initialization |
| **config.py** | 68 | Configuration |
| **retrieval.py** | 156 | Web search integration |
| **chatbot.py** | 203 | Chatbot logic |
| **routes.py** | 435 | API endpoints |

## 🔧 **Usage**

The modular structure maintains clean API interface:

```python
# All imports work the same way
from api.app import app
from api.chatbot import CookingTutorChatbot
from api.retrieval import retrieval_engine
```

## 🛠 **Development Benefits**

- **Easier Debugging**: Issues can be isolated to specific modules
- **Parallel Development**: Multiple developers can work on different modules
- **Code Reviews**: Smaller files are easier to review
- **Documentation**: Each module can have focused documentation
- **Testing**: Unit tests can be written for each module independently
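Editor's note: to make the "easier testing" point above concrete, a minimal sketch with FastAPI's `TestClient` against the lightweight endpoints could look like this. It is assumption-laden: importing `api.app` pulls in the full chatbot and memory stack, so it only runs where `FlashAPI` is set and the embedding model cache exists (for example, inside the Docker image).

```python
# Sketch: exercising the /health and / endpoints with FastAPI's TestClient.
# Assumes FlashAPI is set and the model cache is available before api.app is imported.
from fastapi.testclient import TestClient

from api.app import app

client = TestClient(app)

def test_health():
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json() == {"status": "healthy", "service": "cooking-tutor"}

def test_root_serves_landing_page():
    resp = client.get("/")
    assert resp.status_code == 200
    assert "Cooking Tutor" in resp.text
```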
api/__init__.py
ADDED
@@ -0,0 +1,2 @@
# API package
# Main API endpoints and routes
api/app.py
ADDED
@@ -0,0 +1,49 @@
# api/app_new.py
import uvicorn
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from .config import setup_logging, check_system_resources, optimize_memory, CORS_ORIGINS, validate_environment
from .routes import router

# ✅ Validate environment
validate_environment()

# ✅ Setup logging
logger = setup_logging()
logger.info("🍳 Starting Cooking Tutor API...")

# ✅ Monitor system resources
check_system_resources(logger)

# ✅ Optimize memory usage
optimize_memory()

# ✅ Initialize FastAPI app
app = FastAPI(
    title="Cooking Tutor API",
    description="AI-powered cooking lesson and recipe tutoring with web search",
    version="1.0.0"
)

# ✅ Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# No database initialization required for cooking tutor (web-search only)

# ✅ Include routes
app.include_router(router)

# ✅ Run Uvicorn
if __name__ == "__main__":
    logger.info("[System] ✅ Starting FastAPI Server...")
    try:
        uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
    except Exception as e:
        logger.error(f"❌ Server Startup Failed: {e}")
        exit(1)
api/chatbot.py
ADDED
@@ -0,0 +1,202 @@
# api/chatbot.py
import re
import logging
from typing import Dict
from google import genai
from .config import gemini_flash_api_key
from memory import MemoryManager
from utils import translate_query
from search import search_comprehensive
# Safety guard removed - cooking tutor doesn't need medical safety checks

logger = logging.getLogger("cooking-tutor")

class GeminiClient:
    """Gemini API client for generating responses"""

    def __init__(self):
        self.client = genai.Client(api_key=gemini_flash_api_key)

    def generate_content(self, prompt: str, model: str = "gemini-2.5-flash", temperature: float = 0.7) -> str:
        """Generate content using Gemini API"""
        try:
            response = self.client.models.generate_content(model=model, contents=prompt)
            return response.text
        except Exception as e:
            logger.error(f"[LLM] ❌ Error calling Gemini API: {e}")
            return "Error generating response from Gemini."

class CookingTutorChatbot:
    """Cooking tutor chatbot that uses only web search + memory."""

    def __init__(self, model_name: str):
        self.model_name = model_name
        self.gemini_client = GeminiClient()
        self.memory = MemoryManager()

    def chat(
        self,
        user_id: str,
        user_query: str,
        lang: str = "EN",
        search_mode: bool = True,
        video_mode: bool = False,
        servings: int = None,
        dietary: list = None,
        allergens: list = None,
        equipment: list = None,
        time_limit_minutes: int = None,
        skill_level: str = None,
        cuisine: str = None,
        structured: bool = False,
    ) -> str:
        # Translate to English-centric search if needed
        if lang.upper() in {"VI", "ZH"}:
            user_query = translate_query(user_query, lang.lower())

        # Basic cooking relevance check
        cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing', 'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner']
        query_lower = user_query.lower()
        if not any(keyword in query_lower for keyword in cooking_keywords):
            logger.warning(f"[SAFETY] Non-cooking query detected: {user_query}")
            return "⚠️ I'm a cooking tutor! Please ask me about recipes, cooking techniques, ingredients, or anything food-related."

        # Conversation memory (recent turns)
        contextual_chunks = self.memory.get_contextual_chunks(user_id, user_query, lang)

        # Web search context
        search_context = ""
        url_mapping = {}
        source_aggregation = {}
        video_results = []

        if search_mode:
            try:
                search_context, url_mapping, source_aggregation = search_comprehensive(
                    f"cooking technique tutorial: {user_query}",
                    num_results=12,
                    target_language=lang,
                    include_videos=bool(video_mode)
                )
                if video_mode and source_aggregation:
                    video_results = source_aggregation.get('sources', []) or []
            except Exception as e:
                logger.error(f"[SEARCH] Failed: {e}")

        # Build prompt
        parts = [
            "You are a professional cooking tutor and recipe coach.",
            "Provide step-by-step, practical instructions with exact measurements, temperatures, and timings.",
            "Offer substitutions, variations, pantry-friendly swaps, and troubleshooting tips.",
            "Adapt guidance to different skill levels (beginner/intermediate/advanced).",
            "Use Markdown with headings, numbered steps, bullet lists, and short paragraphs.",
            "Always include a concise Ingredients list when relevant.",
            "Cite sources inline using <#ID> tags already present in the search context when applicable.",
        ]

        # Constraints block
        constraints = []
        if servings:
            constraints.append(f"Servings: {servings}")
        if dietary:
            constraints.append(f"Dietary preferences: {', '.join(dietary)}")
        if allergens:
            constraints.append(f"Avoid allergens: {', '.join(allergens)}")
        if equipment:
            constraints.append(f"Available equipment: {', '.join(equipment)}")
        if time_limit_minutes:
            constraints.append(f"Time limit: {time_limit_minutes} minutes")
        if skill_level:
            constraints.append(f"Skill level: {skill_level}")
        if cuisine:
            constraints.append(f"Cuisine: {cuisine}")

        if constraints:
            parts.append("Constraints to respect:\n- " + "\n- ".join(constraints))

        if contextual_chunks:
            parts.append("Relevant context from previous messages:\n" + contextual_chunks)
        if search_context:
            parts.append("Cooking knowledge from the web (with citations):\n" + search_context)

        parts.append(f"User's cooking question: {user_query}")
        parts.append(f"Language to generate answer: {lang}")

        if structured:
            parts.append(
                "Return a Markdown response with these sections if relevant:"
                "\n1. Title"
                "\n2. Summary (2-3 sentences)"
                "\n3. Ingredients (quantities in metric and US units)"
                "\n4. Equipment"
                "\n5. Step-by-step Instructions (numbered)"
                "\n6. Timing & Temperatures"
                "\n7. Variations & Substitutions"
                "\n8. Troubleshooting & Doneness Cues"
                "\n9. Storage & Reheating"
                "\n10. Sources"
            )

        prompt = "\n\n".join(parts)
        response = self.gemini_client.generate_content(prompt, model=self.model_name, temperature=0.6)

        # Process citations
        if url_mapping:
            response = self._process_citations(response, url_mapping)

        # Basic cooking relevance check for response
        if response and len(response) > 50:
            response_lower = response.lower()
            if not any(keyword in response_lower for keyword in cooking_keywords):
                logger.warning(f"[SAFETY] Non-cooking response detected, redirecting to cooking topic")
                response = "⚠️ Let's stick to cooking-related topics. Try asking about recipes, techniques, or ingredients!"

        if user_id:
            self.memory.add_exchange(user_id, user_query, response, lang=lang)

        if video_mode and video_results:
            return {
                'text': response.strip(),
                'videos': video_results
            }
        return response.strip()

    def _process_citations(self, response: str, url_mapping: Dict[int, str]) -> str:
        """Replace citation tags with actual URLs, handling both single and multiple references"""

        # Pattern to match both single citations <#1> and multiple citations <#1, #2, #5, #7, #9>
        citation_pattern = r'<#([^>]+)>'

        def replace_citation(match):
            citation_content = match.group(1)
            # Split by comma and clean up each citation ID
            citation_ids = [id_str.strip() for id_str in citation_content.split(',')]

            urls = []
            for citation_id in citation_ids:
                try:
                    doc_id = int(citation_id)
                    if doc_id in url_mapping:
                        url = url_mapping[doc_id]
                        urls.append(f'<{url}>')
                        logger.info(f"[CITATION] Replacing <#{doc_id}> with {url}")
                    else:
                        logger.warning(f"[CITATION] No URL mapping found for document ID {doc_id}")
                        urls.append(f'<#{doc_id}>')  # Keep original if URL not found
                except ValueError:
                    logger.warning(f"[CITATION] Invalid citation ID: {citation_id}")
                    urls.append(f'<#{citation_id}>')  # Keep original if invalid

            # Join multiple URLs with spaces
            return ' '.join(urls)

        # Replace citations with URLs
        processed_response = re.sub(citation_pattern, replace_citation, response)

        # Count total citations processed
        citations_found = re.findall(citation_pattern, response)
        total_citations = sum(len([id_str.strip() for id_str in citation_content.split(',')])
                              for citation_content in citations_found)

        logger.info(f"[CITATION] Processed {total_citations} citations from {len(citations_found)} citation groups, {len(url_mapping)} URL mappings available")
        return processed_response
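Editor's note: the citation flow in `_process_citations` is easier to follow with a small standalone sketch of the same idea: search results are numbered, the model cites them as `<#ID>` tags, and the tags are swapped for mapped URLs afterwards. This is a simplified re-implementation with placeholder URLs, not the production code; one difference worth noting is that in the grouped form `<#1, #3>` the production code leaves IDs after the first (which still carry a leading `#`) untouched via its ValueError branch, whereas the sketch strips the `#` before the lookup.

```python
# Standalone sketch of the <#ID> -> URL replacement idea used by _process_citations.
import re

# Placeholder mapping for illustration only.
url_mapping = {
    1: "https://example.com/pasta-guide",
    2: "https://example.com/pasta-water-tips",
}

draft = "Salt the water generously <#1>. Reserve pasta water before draining <#2>."

def replace(match: re.Match) -> str:
    ids = [part.strip().lstrip("#") for part in match.group(1).split(",")]
    urls = [f"<{url_mapping[int(i)]}>" for i in ids if i.isdigit() and int(i) in url_mapping]
    return " ".join(urls) or match.group(0)  # keep the original tag if nothing maps

print(re.sub(r"<#([^>]+)>", replace, draft))
# Each <#N> tag is replaced by the angle-bracketed URL registered under N.
```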
api/config.py
ADDED
@@ -0,0 +1,67 @@
# api/config.py
import os
import logging
import psutil
from typing import List

# ✅ Environment Variables
gemini_flash_api_key = os.getenv("FlashAPI")

# Validate environment endpoint (only when actually running the app)
def validate_environment():
    if not gemini_flash_api_key:
        raise ValueError("❌ Missing FlashAPI key for Gemini. Set env var FlashAPI.")

# ✅ Logging Configuration
def setup_logging():
    """Configure logging for the application"""
    # Silence noisy loggers
    for name in [
        "uvicorn.error", "uvicorn.access",
        "fastapi", "starlette",
        "pymongo", "gridfs",
        "sentence_transformers", "faiss",
        "google", "google.auth",
    ]:
        logging.getLogger(name).setLevel(logging.WARNING)

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s — %(name)s — %(levelname)s — %(message)s",
        force=True
    )

    logger = logging.getLogger("cooking-tutor")
    logger.setLevel(logging.DEBUG)
    return logger

# ✅ System Resource Monitoring
def check_system_resources(logger):
    """Monitor system resources and log warnings"""
    memory = psutil.virtual_memory()
    cpu = psutil.cpu_percent(interval=1)
    disk = psutil.disk_usage("/")

    logger.info(f"[System] 🔍 System Resources - RAM: {memory.percent}%, CPU: {cpu}%, Disk: {disk.percent}%")

    if memory.percent > 85:
        logger.warning("⚠️ High RAM usage detected!")
    if cpu > 90:
        logger.warning("⚠️ High CPU usage detected!")
    if disk.percent > 90:
        logger.warning("⚠️ High Disk usage detected!")

# ✅ Memory Optimization
def optimize_memory():
    """Set environment variables for memory optimization"""
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ✅ CORS Configuration
CORS_ORIGINS = [
    "http://localhost:5173",  # Vite dev server
    "http://localhost:3000",  # Another vercel local dev
    "https://cooking-tutor.vercel.app",  # ✅ Vercel frontend production URL
]

# No embedding/RAG models used in cooking tutor
api/retrieval.py
ADDED
@@ -0,0 +1,155 @@
# api/retrieval.py
import os
import re
import time
import requests
import numpy as np
import logging
from typing import List, Dict
# Database removed - cooking tutor uses web search only
from models import summarizer

logger = logging.getLogger("retrieval-bot")

class RetrievalEngine:
    def __init__(self):
        # Database removed - cooking tutor uses web search only
        self._reranker = None

    def _get_reranker(self):
        """Initialize the NVIDIA reranker on first use."""
        if self._reranker is None:
            self._reranker = _NvidiaReranker()
        return self._reranker

    @staticmethod
    def _is_cooking_guide_text(text: str) -> bool:
        """Heuristic to detect cooking guide content."""
        if not text:
            return False
        keywords = [
            # common cooking guide indicators
            r"\bguideline(s)?\b", r"\bcooking practice\b", r"\brecommend(ation|ed|s)?\b",
            r"\bshould\b", r"\bmust\b", r"\bstrongly (recommend|suggest)\b",
            r"\brecipe\b", r"\btechnique\b", r"\bmethod\b", r"\binstruction\b",
            r"\btemperature\b", r"\btiming\b", r"\bmeasurement\b"
        ]
        text_lc = text.lower()
        return any(re.search(p, text_lc, flags=re.IGNORECASE) for p in keywords)

    @staticmethod
    def _extract_cooking_guide_sentences(text: str) -> str:
        """Extract likely cooking guide sentences to reduce conversational/noisy content before summarization."""
        if not text:
            return ""
        sentences = re.split(r"(?<=[.!?])\s+", text)
        keep_patterns = [
            r"\b(recommend|should|must|preferred|first-choice|consider)\b",
            r"\b(temperature|timing|measurement|portion|serving)\b",
            r"\b(ingredient|seasoning|spice|herb|sauce|marinade)\b",
            r"\b(prepare|cook|bake|roast|grill|fry|boil|steam)\b"
        ]
        kept = []
        for s in sentences:
            s_norm = s.strip()
            if not s_norm:
                continue
            if any(re.search(p, s_norm, flags=re.IGNORECASE) for p in keep_patterns):
                kept.append(s_norm)
        # Fallback: if filtering too aggressive, keep truncated original
        if not kept:
            return text[:1200]
        return " ".join(kept)[:2000]

    def retrieve_cooking_info(self, query: str, k: int = 5, min_sim: float = 0.8) -> list:
        """
        Retrieve cooking information - placeholder for web search integration
        """
        # This method is kept for compatibility but cooking tutor uses web search
        logger.info(f"[Retrieval] Cooking info retrieval requested for: {query}")
        return [""]

    def retrieve_recipe_suggestions(self, ingredient_text: str, top_k: int = 5, min_sim: float = 0.5) -> list:
        """
        Retrieve recipe suggestions from ingredients - placeholder for web search integration
        """
        # This method is kept for compatibility but cooking tutor uses web search
        logger.info(f"[Retrieval] Recipe suggestions requested for ingredients: {ingredient_text}")
        return [""]

# Global retrieval engine instance
retrieval_engine = RetrievalEngine()


class _NvidiaReranker:
    """Simple client for NVIDIA NIM reranking: nvidia/rerank-qa-mistral-4b"""
    def __init__(self):
        self.api_key = os.getenv("NVIDIA_URI")
        # Use provider doc model identifier
        self.model = os.getenv("NVIDIA_RERANK_MODEL", "nv-rerank-qa-mistral-4b:1")
        # NIM rerank endpoint (subject to environment); keep configurable
        self.base_url = os.getenv("NVIDIA_RERANK_ENDPOINT", "https://ai.api.nvidia.com/v1/retrieval/nvidia/reranking")
        self.timeout_s = 30

    def rerank(self, query: str, documents: List[str]) -> List[Dict]:
        if not self.api_key:
            raise ValueError("NVIDIA_URI not set for reranker")
        if not documents:
            return []
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        # Truncate and limit candidates to avoid 4xx
        docs = documents[:10]
        docs = [d[:2000] for d in docs if isinstance(d, str)]
        # Two payload shapes based on provider doc
        payloads = [
            {
                "model": self.model,
                "query": {"text": query},
                "passages": [{"text": d} for d in docs],
            },
            {
                "model": self.model,
                "query": query,
                "documents": [{"text": d} for d in docs],
            },
        ]
        try:
            data = None
            for p in payloads:
                resp = requests.post(self.base_url, headers=headers, json=p, timeout=self.timeout_s)
                if resp.status_code >= 400:
                    # try next shape
                    continue
                data = resp.json()
                break
            if data is None:
                # last attempt for diagnostics
                resp.raise_for_status()
            # Expecting a list with scores and indices or texts
            results = []
            entries = data.get("results") or data.get("data") or []
            if isinstance(entries, list) and entries:
                for entry in entries:
                    # Common patterns: {index, score} or {text, score}
                    idx = entry.get("index")
                    text = entry.get("text") if entry.get("text") else (documents[idx] if idx is not None and idx < len(documents) else None)
                    score = entry.get("score", 0)
                    if text:
                        results.append({"text": text, "score": float(score)})
            else:
                # Fallback: if API returns scores aligned to input order
                scores = data.get("scores")
                if isinstance(scores, list) and len(scores) == len(documents):
                    for t, s in zip(documents, scores):
                        results.append({"text": t, "score": float(s)})
            # Sort by score desc
            results.sort(key=lambda x: x.get("score", 0), reverse=True)
            return results
        except Exception as e:
            logger.warning(f"[Reranker] Failed calling NVIDIA reranker: {e}")
            # On failure, return original order with neutral scores
            return [{"text": d, "score": 0.0} for d in documents]
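Editor's note: a quick sketch of how the `_extract_cooking_guide_sentences` heuristic behaves on noisy page text. The sample paragraph is invented for illustration, and the snippet assumes the repository's `models` package is importable, since `api.retrieval` pulls in the summarizer at import time.

```python
# Sketch: filtering extracted page text down to instruction-like sentences.
# Assumes the project is on the Python path and its models package imports cleanly.
from api.retrieval import RetrievalEngine

sample = (
    "Welcome to my blog! My grandmother loved Sundays. "
    "You should roast the peppers at 220 C for 20 minutes. "
    "Season the sauce with basil and a pinch of salt. "
    "Don't forget to subscribe for more posts."
)

print(RetrievalEngine._extract_cooking_guide_sentences(sample))
# Keeps the sentences that mention cooking actions, temperatures, or seasonings
# and drops the conversational filler around them.
```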
api/routes.py
ADDED
@@ -0,0 +1,434 @@
# api/routes.py
import time
import os
import re
import json
import logging
import uuid
from datetime import datetime, timedelta
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse, HTMLResponse
from .chatbot import CookingTutorChatbot

logger = logging.getLogger("routes")

# Create router
router = APIRouter()

# Initialize cooking tutor chatbot
chatbot = CookingTutorChatbot(
    model_name="gemini-2.5-flash"
)

@router.post("/chat")
async def chat_endpoint(req: Request):
    """Chat endpoint (web-search only). No DB persistence, no image handling."""
    body = await req.json()
    user_id = body.get("user_id", "anonymous")
    query_raw = body.get("query")
    query = query_raw.strip() if isinstance(query_raw, str) else ""
    lang = body.get("lang", "EN")
    search_mode = body.get("search", True)
    video_mode = body.get("video", False)
    # Optional cooking constraints
    servings = body.get("servings")
    dietary = body.get("dietary")  # e.g., ["vegetarian", "gluten-free"]
    allergens = body.get("allergens")  # e.g., ["peanuts", "shellfish"]
    equipment = body.get("equipment")  # e.g., ["oven", "cast iron skillet"]
    time_limit = body.get("time_limit_minutes")  # e.g., 30
    skill_level = body.get("skill_level")  # beginner|intermediate|advanced
    cuisine = body.get("cuisine")  # e.g., "Italian"
    structured = body.get("structured", False)

    start = time.time()
    try:
        answer = chatbot.chat(
            user_id,
            query,
            lang,
            search_mode,
            video_mode,
            servings=servings,
            dietary=dietary,
            allergens=allergens,
            equipment=equipment,
            time_limit_minutes=time_limit,
            skill_level=skill_level,
            cuisine=cuisine,
            structured=structured,
        )
        elapsed = time.time() - start

        # Handle response format (might be string or dict with videos)
        if isinstance(answer, dict):
            response_text = answer.get('text', '')
            video_data = answer.get('videos', [])
        else:
            response_text = answer
            video_data = []

        # Final response
        response_data = {"response": f"{response_text}\n\n(Response time: {elapsed:.2f}s)"}

        # Include video data if available
        if video_data:
            response_data["videos"] = video_data

        return JSONResponse(response_data)

    except Exception as e:
        logger.error(f"[REQUEST] Error processing request: {e}")
        return JSONResponse({"response": "❌ Failed to get a response. Please try again."})

@router.get("/check-request/{request_id}")
async def check_request_status(request_id: str):
    """Legacy endpoint kept for compatibility; returns not supported."""
    return JSONResponse({"status": "unsupported"})

@router.get("/pending-requests/{user_id}")
async def get_pending_requests(user_id: str):
    """Legacy endpoint kept for compatibility; returns empty list."""
    return JSONResponse({"requests": []})

@router.delete("/cleanup-requests")
async def cleanup_old_requests():
    """Legacy endpoint kept for compatibility; no-op."""
    return JSONResponse({"deleted_count": 0})

@router.get("/health")
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "service": "cooking-tutor"}

@router.get("/")
async def root():
    """Root endpoint - Landing page with redirect to main app"""

    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Cooking Tutor API</title>
        <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
        <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
        <style>
            * {
                margin: 0;
                padding: 0;
                box-sizing: border-box;
            }

            body {
                font-family: 'Inter', sans-serif;
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                min-height: 100vh;
                display: flex;
                align-items: center;
                justify-content: center;
                overflow: hidden;
                position: relative;
            }

            /* Animated background particles */
            .particles {
                position: absolute;
                top: 0;
                left: 0;
                width: 100%;
                height: 100%;
                overflow: hidden;
                z-index: 1;
            }

            .particle {
                position: absolute;
                background: rgba(255, 255, 255, 0.1);
                border-radius: 50%;
                animation: float 6s ease-in-out infinite;
            }

            .particle:nth-child(1) { width: 80px; height: 80px; top: 20%; left: 10%; animation-delay: 0s; }
            .particle:nth-child(2) { width: 120px; height: 120px; top: 60%; left: 80%; animation-delay: 2s; }
            .particle:nth-child(3) { width: 60px; height: 60px; top: 80%; left: 20%; animation-delay: 4s; }
            .particle:nth-child(4) { width: 100px; height: 100px; top: 10%; left: 70%; animation-delay: 1s; }
            .particle:nth-child(5) { width: 90px; height: 90px; top: 40%; left: 50%; animation-delay: 3s; }

            @keyframes float {
                0%, 100% { transform: translateY(0px) rotate(0deg); opacity: 0.7; }
                50% { transform: translateY(-20px) rotate(180deg); opacity: 1; }
            }

            .container {
                background: rgba(255, 255, 255, 0.1);
                backdrop-filter: blur(20px);
                border: 1px solid rgba(255, 255, 255, 0.2);
                border-radius: 24px;
                padding: 3rem 2rem;
                text-align: center;
                box-shadow: 0 20px 40px rgba(0, 0, 0, 0.1);
                max-width: 500px;
                width: 90%;
                position: relative;
                z-index: 2;
                animation: slideUp 0.8s ease-out;
            }

            @keyframes slideUp {
                from {
                    opacity: 0;
                    transform: translateY(50px);
                }
                to {
                    opacity: 1;
                    transform: translateY(0);
                }
            }

            .logo {
                width: 80px;
                height: 80px;
                background: linear-gradient(135deg, #f59e0b 0%, #ef4444 100%);
                border-radius: 20px;
                display: flex;
                align-items: center;
                justify-content: center;
                margin: 0 auto 1.5rem;
                animation: pulse 2s ease-in-out infinite;
            }

            @keyframes pulse {
                0%, 100% { transform: scale(1); }
                50% { transform: scale(1.05); }
            }

            .logo i {
                font-size: 2rem;
                color: white;
            }

            h1 {
                color: white;
                font-size: 2.5rem;
                font-weight: 700;
                margin-bottom: 0.5rem;
                background: linear-gradient(135deg, #ffffff 0%, #f0f9ff 100%);
                -webkit-background-clip: text;
                -webkit-text-fill-color: transparent;
                background-clip: text;
            }

            .subtitle {
                color: rgba(255, 255, 255, 0.8);
                font-size: 1.1rem;
                margin-bottom: 2rem;
                font-weight: 400;
            }

            .version {
                color: rgba(255, 255, 255, 0.6);
                font-size: 0.9rem;
                margin-bottom: 2rem;
                font-weight: 300;
            }

            .redirect-btn {
                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
                color: white;
                border: none;
                padding: 1rem 2rem;
                border-radius: 12px;
                font-size: 1.1rem;
                font-weight: 600;
                cursor: pointer;
                transition: all 0.3s ease;
                text-decoration: none;
                display: inline-flex;
                align-items: center;
                gap: 0.5rem;
                box-shadow: 0 8px 20px rgba(102, 126, 234, 0.3);
                position: relative;
                overflow: hidden;
            }

            .redirect-btn::before {
                content: '';
                position: absolute;
                top: 0;
                left: -100%;
                width: 100%;
                height: 100%;
                background: linear-gradient(90deg, transparent, rgba(255, 255, 255, 0.2), transparent);
                transition: left 0.5s;
            }

            .redirect-btn:hover::before {
                left: 100%;
            }

            .redirect-btn:hover {
                transform: translateY(-2px);
                box-shadow: 0 12px 30px rgba(102, 126, 234, 0.4);
            }

            .redirect-btn:active {
                transform: translateY(0);
            }

            .redirect-btn i {
                font-size: 1.2rem;
                transition: transform 0.3s ease;
            }

            .redirect-btn:hover i {
                transform: translateX(3px);
            }

            .features {
                margin-top: 2rem;
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
                gap: 1rem;
            }

            .feature {
                color: rgba(255, 255, 255, 0.7);
                font-size: 0.9rem;
                font-weight: 500;
            }

            .feature i {
                display: block;
                font-size: 1.5rem;
                margin-bottom: 0.5rem;
                color: rgba(255, 255, 255, 0.9);
            }

            @media (max-width: 768px) {
                .container {
                    padding: 2rem 1.5rem;
                    margin: 1rem;
                }

                h1 {
                    font-size: 2rem;
                }

                .subtitle {
                    font-size: 1rem;
                }

                .redirect-btn {
                    padding: 0.8rem 1.5rem;
                    font-size: 1rem;
                }
            }
        </style>
    </head>
    <body>
        <div class="particles">
            <div class="particle"></div>
            <div class="particle"></div>
            <div class="particle"></div>
            <div class="particle"></div>
            <div class="particle"></div>
        </div>

        <div class="container">
            <div class="logo">
                <i class="fas fa-utensils"></i>
            </div>

            <h1>Cooking Tutor</h1>
            <p class="subtitle">AI-Powered Cooking Lessons & Recipe Guidance</p>
            <p class="version">API Version 1.0.0</p>

            <a href="/" class="redirect-btn" target="_blank">
                <i class="fas fa-external-link-alt"></i>
                Open Frontend
            </a>

            <div class="features">
                <div class="feature">
                    <i class="fas fa-seedling"></i>
                    Friendly
                </div>
                <div class="feature">
                    <i class="fas fa-list-ol"></i>
                    Step-by-step
                </div>
                <div class="feature">
                    <i class="fas fa-globe"></i>
                    Multi-Language
                </div>
            </div>
        </div>

        <script>
            // Add some interactive effects
            document.addEventListener('DOMContentLoaded', function() {
                const btn = document.querySelector('.redirect-btn');
                const particles = document.querySelectorAll('.particle');

                // Add click animation
                btn.addEventListener('click', function(e) {
                    // Create ripple effect
                    const ripple = document.createElement('span');
                    const rect = this.getBoundingClientRect();
                    const size = Math.max(rect.width, rect.height);
                    const x = e.clientX - rect.left - size / 2;
                    const y = e.clientY - rect.top - size / 2;

                    ripple.style.cssText = `
                        position: absolute;
                        width: ${size}px;
                        height: ${size}px;
                        left: ${x}px;
                        top: ${y}px;
                        background: rgba(255, 255, 255, 0.3);
                        border-radius: 50%;
                        transform: scale(0);
                        animation: ripple 0.6s ease-out;
                        pointer-events: none;
                    `;

                    this.appendChild(ripple);

                    setTimeout(() => {
                        ripple.remove();
                    }, 600);
                });

                // Add CSS for ripple animation
                const style = document.createElement('style');
                style.textContent = `
                    @keyframes ripple {
                        to {
                            transform: scale(2);
                            opacity: 0;
                        }
                    }
                `;
                document.head.appendChild(style);

                // Animate particles on mouse move
                document.addEventListener('mousemove', function(e) {
                    const x = e.clientX / window.innerWidth;
                    const y = e.clientY / window.innerHeight;

                    particles.forEach((particle, index) => {
                        const speed = (index + 1) * 0.5;
                        const xOffset = (x - 0.5) * speed * 20;
                        const yOffset = (y - 0.5) * speed * 20;

                        particle.style.transform = `translate(${xOffset}px, ${yOffset}px)`;
                    });
                });
            });
        </script>
    </body>
    </html>
    """

    return HTMLResponse(content=html_content)
main.py
ADDED
@@ -0,0 +1,13 @@
# main.py - Entry point for the Cooking Tutor API
import uvicorn
from api.app import app

if __name__ == "__main__":
    print("🍳 Starting Cooking Tutor API...")
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
        log_level="info",
        reload=False  # Set to True for development
    )
memory/__init__.py
ADDED
@@ -0,0 +1,2 @@
# Memory package
from .memory import MemoryManager
memory/memory.py
ADDED
|
@@ -0,0 +1,331 @@
| 1 |
+
# memory_updated.py
|
| 2 |
+
import re, time, hashlib, asyncio, os
|
| 3 |
+
from collections import defaultdict, deque
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import numpy as np
|
| 6 |
+
import faiss
|
| 7 |
+
from sentence_transformers import SentenceTransformer
|
| 8 |
+
from google import genai # must be configured in app.py and imported globally
|
| 9 |
+
import logging
|
| 10 |
+
from models.summarizer import summarizer
|
| 11 |
+
|
| 12 |
+
_LLM_SMALL = "gemini-2.5-flash-lite-preview-06-17"
|
| 13 |
+
# Load embedding model
|
| 14 |
+
EMBED = SentenceTransformer("/app/model_cache", device="cpu").half()
|
| 15 |
+
logger = logging.getLogger("rag-agent")
|
| 16 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True) # Change INFO to DEBUG for full-ctx JSON loader
|
| 17 |
+
|
| 18 |
+
api_key = os.getenv("FlashAPI")
|
| 19 |
+
client = genai.Client(api_key=api_key)
|
| 20 |
+
|
| 21 |
+
class MemoryManager:
|
| 22 |
+
def __init__(self, max_users=1000, history_per_user=20, max_chunks=60):
|
| 23 |
+
# STM: recent conversation summaries (topic + summary), capped at history_per_user entries
|
| 24 |
+
self.stm_summaries = defaultdict(lambda: deque(maxlen=history_per_user)) # deque of {topic,text,vec,timestamp,used}
|
| 25 |
+
# Legacy raw cache (kept for compatibility if needed)
|
| 26 |
+
self.text_cache = defaultdict(lambda: deque(maxlen=history_per_user))
|
| 27 |
+
# LTM: semantic chunk store (approx 3 chunks x 20 rounds)
|
| 28 |
+
self.chunk_index = defaultdict(self._new_index) # user_id -> faiss index
|
| 29 |
+
self.chunk_meta = defaultdict(list)          # user_id -> list[{text,tag,vec,timestamp,used}]
|
| 30 |
+
self.user_queue = deque(maxlen=max_users) # LRU of users
|
| 31 |
+
self.max_chunks = max_chunks # hard cap per user
|
| 32 |
+
self.chunk_cache = {} # hash(query+resp) -> [chunks]
|
| 33 |
+
|
| 34 |
+
# ---------- Public API ----------
|
| 35 |
+
def add_exchange(self, user_id: str, query: str, response: str, lang: str = "EN"):
|
| 36 |
+
self._touch_user(user_id)
|
| 37 |
+
# Keep raw record (optional)
|
| 38 |
+
self.text_cache[user_id].append(((query or "").strip(), (response or "").strip()))
|
| 39 |
+
if not response: return []
|
| 40 |
+
# Avoid re-chunking identical response
|
| 41 |
+
cache_key = hashlib.md5((query + response).encode()).hexdigest()
|
| 42 |
+
if cache_key in self.chunk_cache:
|
| 43 |
+
chunks = self.chunk_cache[cache_key]
|
| 44 |
+
else:
|
| 45 |
+
chunks = self.chunk_response(response, lang, question=query)
|
| 46 |
+
self.chunk_cache[cache_key] = chunks
|
| 47 |
+
# Update STM with merging/deduplication
|
| 48 |
+
for chunk in chunks:
|
| 49 |
+
self._upsert_stm(user_id, chunk, lang)
|
| 50 |
+
# Update LTM with merging/deduplication
|
| 51 |
+
self._upsert_ltm(user_id, chunks, lang)
|
| 52 |
+
return chunks
|
| 53 |
+
|
| 54 |
+
def get_relevant_chunks(self, user_id: str, query: str, top_k: int = 3, min_sim: float = 0.30) -> List[str]:
|
| 55 |
+
"""Return texts of chunks whose cosine similarity ≥ min_sim."""
|
| 56 |
+
if self.chunk_index[user_id].ntotal == 0:
|
| 57 |
+
return []
|
| 58 |
+
# Encode chunk
|
| 59 |
+
qvec = self._embed(query)
|
| 60 |
+
sims, idxs = self.chunk_index[user_id].search(np.array([qvec]), k=top_k)
|
| 61 |
+
results = []
|
| 62 |
+
# Append related result with smart-decay to optimize storage and prioritize most-recent chat
|
| 63 |
+
for sim, idx in zip(sims[0], idxs[0]):
|
| 64 |
+
if idx < len(self.chunk_meta[user_id]) and sim >= min_sim:
|
| 65 |
+
chunk = self.chunk_meta[user_id][idx]
|
| 66 |
+
chunk["used"] += 1 # increment usage
|
| 67 |
+
# Decay function
|
| 68 |
+
age_sec = time.time() - chunk["timestamp"]
|
| 69 |
+
decay = 1.0 / (1.0 + age_sec / 300) # 5-min half-life
|
| 70 |
+
score = sim * decay * (1 + 0.1 * chunk["used"])
|
| 71 |
+
# Append chunk with score
|
| 72 |
+
results.append((score, chunk))
|
| 73 |
+
# Sort result on best scored
|
| 74 |
+
results.sort(key=lambda x: x[0], reverse=True)
|
| 75 |
+
# logger.info(f"[Memory] RAG Retrieved Topic: {results}") # Inspect vector data
|
| 76 |
+
return [f"### Topic: {c['tag']}\n{c['text']}" for _, c in results]
|
| 77 |
+
|
| 78 |
+
def get_recent_chat_history(self, user_id: str, num_turns: int = 5) -> List[Dict]:
|
| 79 |
+
"""
|
| 80 |
+
Get the most recent short-term memory summaries.
|
| 81 |
+
Returns: a list of entries containing only the summarized bot context.
|
| 82 |
+
"""
|
| 83 |
+
if user_id not in self.stm_summaries:
|
| 84 |
+
return []
|
| 85 |
+
recent = list(self.stm_summaries[user_id])[-num_turns:]
|
| 86 |
+
formatted = []
|
| 87 |
+
for entry in recent:
|
| 88 |
+
formatted.append({
|
| 89 |
+
"user": "",
|
| 90 |
+
"bot": f"Topic: {entry['topic']}\n{entry['text']}",
|
| 91 |
+
"timestamp": entry.get("timestamp", time.time())
|
| 92 |
+
})
|
| 93 |
+
return formatted
|
| 94 |
+
|
| 95 |
+
def get_context(self, user_id: str, num_turns: int = 5) -> str:
|
| 96 |
+
# Prefer STM summaries
|
| 97 |
+
history = self.get_recent_chat_history(user_id, num_turns=num_turns)
|
| 98 |
+
return "\n".join(h["bot"] for h in history)
|
| 99 |
+
|
| 100 |
+
def get_contextual_chunks(self, user_id: str, current_query: str, lang: str = "EN") -> str:
|
| 101 |
+
"""
|
| 102 |
+
Use NVIDIA Llama to create a summarization of relevant context from both recent history and RAG chunks.
|
| 103 |
+
This ensures conversational continuity while providing a concise summary for the main LLM.
|
| 104 |
+
"""
|
| 105 |
+
# Get both types of context
|
| 106 |
+
recent_history = self.get_recent_chat_history(user_id, num_turns=5)
|
| 107 |
+
rag_chunks = self.get_relevant_chunks(user_id, current_query, top_k=3)
|
| 108 |
+
|
| 109 |
+
logger.info(f"[Contextual] Retrieved {len(recent_history)} recent history items")
|
| 110 |
+
logger.info(f"[Contextual] Retrieved {len(rag_chunks)} RAG chunks")
|
| 111 |
+
|
| 112 |
+
# Return empty string if no context is found
|
| 113 |
+
if not recent_history and not rag_chunks:
|
| 114 |
+
logger.info(f"[Contextual] No context found, returning empty string")
|
| 115 |
+
return ""
|
| 116 |
+
|
| 117 |
+
# Prepare context for summarization
|
| 118 |
+
context_parts = []
|
| 119 |
+
# Add recent chat history
|
| 120 |
+
if recent_history:
|
| 121 |
+
history_text = "\n".join([
|
| 122 |
+
f"User: {item['user']}\nBot: {item['bot']}"
|
| 123 |
+
for item in recent_history
|
| 124 |
+
])
|
| 125 |
+
context_parts.append(f"Recent conversation history:\n{history_text}")
|
| 126 |
+
# Add RAG chunks
|
| 127 |
+
if rag_chunks:
|
| 128 |
+
rag_text = "\n".join(rag_chunks)
|
| 129 |
+
context_parts.append(f"Semantically relevant historical cooking information:\n{rag_text}")
|
| 130 |
+
|
| 131 |
+
# Combine all context
|
| 132 |
+
full_context = "\n\n".join(context_parts)
|
| 133 |
+
|
| 134 |
+
# Use summarizer to create concise summary
|
| 135 |
+
try:
|
| 136 |
+
summary = summarizer.summarize_text(full_context, max_length=300)
|
| 137 |
+
logger.info(f"[Contextual] Generated summary using NVIDIA Llama: {len(summary)} characters")
|
| 138 |
+
return summary
|
| 139 |
+
except Exception as e:
|
| 140 |
+
logger.error(f"[Contextual] Summarization failed: {e}")
|
| 141 |
+
return full_context[:500] + "..." if len(full_context) > 500 else full_context
|
| 142 |
+
|
| 143 |
+
def chunk_response(self, response: str, lang: str, question: str = "") -> List[Dict]:
|
| 144 |
+
"""
|
| 145 |
+
Use NVIDIA Llama to chunk and summarize response by cooking topics.
|
| 146 |
+
Returns: [{"tag": ..., "text": ...}, ...]
|
| 147 |
+
"""
|
| 148 |
+
if not response:
|
| 149 |
+
return []
|
| 150 |
+
|
| 151 |
+
try:
|
| 152 |
+
# Use summarizer to chunk and summarize
|
| 153 |
+
chunks = summarizer.chunk_response(response, max_chunk_size=500)
|
| 154 |
+
|
| 155 |
+
# Convert to the expected format
|
| 156 |
+
result_chunks = []
|
| 157 |
+
for i, chunk in enumerate(chunks):
|
| 158 |
+
# Extract topic from chunk (first sentence or key cooking terms)
|
| 159 |
+
topic = self._extract_topic_from_chunk(chunk)
|
| 160 |
+
|
| 161 |
+
result_chunks.append({
|
| 162 |
+
"tag": topic,
|
| 163 |
+
"text": chunk
|
| 164 |
+
})
|
| 165 |
+
|
| 166 |
+
logger.info(f"[Memory] 📦 NVIDIA Llama summarized {len(result_chunks)} chunks")
|
| 167 |
+
return result_chunks
|
| 168 |
+
|
| 169 |
+
except Exception as e:
|
| 170 |
+
logger.error(f"[Memory] NVIDIA Llama chunking failed: {e}")
|
| 171 |
+
# Fallback to simple chunking
|
| 172 |
+
return self._fallback_chunking(response)
|
| 173 |
+
|
| 174 |
+
def _extract_topic_from_chunk(self, chunk: str) -> str:
|
| 175 |
+
"""Extract a concise topic from a chunk"""
|
| 176 |
+
# Look for cooking terms or first sentence
|
| 177 |
+
sentences = chunk.split('.')
|
| 178 |
+
if sentences:
|
| 179 |
+
first_sentence = sentences[0].strip()
|
| 180 |
+
if len(first_sentence) > 50:
|
| 181 |
+
first_sentence = first_sentence[:50] + "..."
|
| 182 |
+
return first_sentence
|
| 183 |
+
return "Cooking Information"
|
| 184 |
+
|
| 185 |
+
def _fallback_chunking(self, response: str) -> List[Dict]:
|
| 186 |
+
"""Fallback chunking when NVIDIA Llama fails"""
|
| 187 |
+
# Simple sentence-based chunking
|
| 188 |
+
sentences = re.split(r'[.!?]+', response)
|
| 189 |
+
chunks = []
|
| 190 |
+
current_chunk = ""
|
| 191 |
+
|
| 192 |
+
for sentence in sentences:
|
| 193 |
+
sentence = sentence.strip()
|
| 194 |
+
if not sentence:
|
| 195 |
+
continue
|
| 196 |
+
|
| 197 |
+
if len(current_chunk) + len(sentence) > 300:
|
| 198 |
+
if current_chunk:
|
| 199 |
+
chunks.append({
|
| 200 |
+
"tag": "Cooking Information",
|
| 201 |
+
"text": current_chunk.strip()
|
| 202 |
+
})
|
| 203 |
+
current_chunk = sentence
|
| 204 |
+
else:
|
| 205 |
+
current_chunk += sentence + ". "
|
| 206 |
+
|
| 207 |
+
if current_chunk:
|
| 208 |
+
chunks.append({
|
| 209 |
+
"tag": "Cooking Information",
|
| 210 |
+
"text": current_chunk.strip()
|
| 211 |
+
})
|
| 212 |
+
|
| 213 |
+
return chunks
|
| 214 |
+
|
| 215 |
+
# ---------- Private Methods ----------
|
| 216 |
+
def _touch_user(self, user_id: str):
|
| 217 |
+
"""Update LRU queue"""
|
| 218 |
+
if user_id in self.user_queue:
|
| 219 |
+
self.user_queue.remove(user_id)
|
| 220 |
+
self.user_queue.append(user_id)
|
| 221 |
+
|
| 222 |
+
def _new_index(self):
|
| 223 |
+
"""Create new FAISS index"""
|
| 224 |
+
return faiss.IndexFlatIP(384) # 384-dim embeddings
|
| 225 |
+
|
| 226 |
+
def _upsert_stm(self, user_id: str, chunk: Dict, lang: str):
|
| 227 |
+
"""Update short-term memory with merging/deduplication"""
|
| 228 |
+
topic = chunk["tag"]
|
| 229 |
+
text = chunk["text"]
|
| 230 |
+
|
| 231 |
+
# Check for similar topics in STM
|
| 232 |
+
for entry in self.stm_summaries[user_id]:
|
| 233 |
+
if self._topics_similar(topic, entry["topic"]):
|
| 234 |
+
# Merge with existing entry
|
| 235 |
+
entry["text"] = summarizer.summarize_text(
|
| 236 |
+
f"{entry['text']}\n{text}",
|
| 237 |
+
max_length=200
|
| 238 |
+
)
|
| 239 |
+
entry["timestamp"] = time.time()
|
| 240 |
+
return
|
| 241 |
+
|
| 242 |
+
# Add new entry
|
| 243 |
+
self.stm_summaries[user_id].append({
|
| 244 |
+
"topic": topic,
|
| 245 |
+
"text": text,
|
| 246 |
+
"vec": self._embed(f"{topic} {text}"),
|
| 247 |
+
"timestamp": time.time(),
|
| 248 |
+
"used": 0
|
| 249 |
+
})
|
| 250 |
+
|
| 251 |
+
def _upsert_ltm(self, user_id: str, chunks: List[Dict], lang: str):
|
| 252 |
+
"""Update long-term memory with merging/deduplication"""
|
| 253 |
+
for chunk in chunks:
|
| 254 |
+
# Check for similar chunks in LTM
|
| 255 |
+
similar_idx = self._find_similar_chunk(user_id, chunk["text"])
|
| 256 |
+
|
| 257 |
+
if similar_idx is not None:
|
| 258 |
+
# Merge with existing chunk
|
| 259 |
+
existing = self.chunk_meta[user_id][similar_idx]
|
| 260 |
+
merged_text = summarizer.summarize_text(
|
| 261 |
+
f"{existing['text']}\n{chunk['text']}",
|
| 262 |
+
max_length=300
|
| 263 |
+
)
|
| 264 |
+
existing["text"] = merged_text
|
| 265 |
+
existing["timestamp"] = time.time()
|
| 266 |
+
else:
|
| 267 |
+
# Add new chunk
|
| 268 |
+
if len(self.chunk_meta[user_id]) >= self.max_chunks:
|
| 269 |
+
# Remove oldest chunk
|
| 270 |
+
self._remove_oldest_chunk(user_id)
|
| 271 |
+
|
| 272 |
+
vec = self._embed(chunk["text"])
|
| 273 |
+
self.chunk_index[user_id].add(np.array([vec]))
|
| 274 |
+
self.chunk_meta[user_id].append({
|
| 275 |
+
"text": chunk["text"],
|
| 276 |
+
"tag": chunk["tag"],
|
| 277 |
+
"vec": vec,
|
| 278 |
+
"timestamp": time.time(),
|
| 279 |
+
"used": 0
|
| 280 |
+
})
|
| 281 |
+
|
| 282 |
+
def _topics_similar(self, topic1: str, topic2: str) -> bool:
|
| 283 |
+
"""Check if two topics are similar"""
|
| 284 |
+
# Simple similarity check based on common words
|
| 285 |
+
words1 = set(topic1.lower().split())
|
| 286 |
+
words2 = set(topic2.lower().split())
|
| 287 |
+
intersection = words1.intersection(words2)
|
| 288 |
+
return len(intersection) >= 2
|
| 289 |
+
|
| 290 |
+
def _find_similar_chunk(self, user_id: str, text: str) -> int:
|
| 291 |
+
"""Find similar chunk in LTM"""
|
| 292 |
+
if not self.chunk_meta[user_id]:
|
| 293 |
+
return None
|
| 294 |
+
|
| 295 |
+
text_vec = self._embed(text)
|
| 296 |
+
sims, idxs = self.chunk_index[user_id].search(np.array([text_vec]), k=3)
|
| 297 |
+
|
| 298 |
+
for sim, idx in zip(sims[0], idxs[0]):
|
| 299 |
+
if sim > 0.8: # High similarity threshold
|
| 300 |
+
return int(idx)
|
| 301 |
+
return None
|
| 302 |
+
|
| 303 |
+
def _remove_oldest_chunk(self, user_id: str):
|
| 304 |
+
"""Remove the oldest chunk from LTM"""
|
| 305 |
+
if not self.chunk_meta[user_id]:
|
| 306 |
+
return
|
| 307 |
+
|
| 308 |
+
# Find oldest chunk
|
| 309 |
+
oldest_idx = min(range(len(self.chunk_meta[user_id])),
|
| 310 |
+
key=lambda i: self.chunk_meta[user_id][i]["timestamp"])
|
| 311 |
+
|
| 312 |
+
# Remove from index and metadata
|
| 313 |
+
self.chunk_meta[user_id].pop(oldest_idx)
|
| 314 |
+
# Note: FAISS doesn't support direct removal, so we rebuild the index
|
| 315 |
+
self._rebuild_index(user_id)
|
| 316 |
+
|
| 317 |
+
def _rebuild_index(self, user_id: str):
|
| 318 |
+
"""Rebuild FAISS index after removal"""
|
| 319 |
+
if not self.chunk_meta[user_id]:
|
| 320 |
+
self.chunk_index[user_id] = self._new_index()
|
| 321 |
+
return
|
| 322 |
+
|
| 323 |
+
vectors = [chunk["vec"] for chunk in self.chunk_meta[user_id]]
|
| 324 |
+
self.chunk_index[user_id] = self._new_index()
|
| 325 |
+
self.chunk_index[user_id].add(np.array(vectors))
|
| 326 |
+
|
| 327 |
+
@staticmethod
|
| 328 |
+
def _embed(text: str):
|
| 329 |
+
vec = EMBED.encode(text, convert_to_numpy=True)
|
| 330 |
+
# L2 normalise for cosine on IndexFlatIP
|
| 331 |
+
return vec / (np.linalg.norm(vec) + 1e-9)
|
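For readers wiring MemoryManager into a route, this is a minimal usage sketch showing the intended call order: add_exchange after each reply, then get_relevant_chunks / get_context before building the next prompt. The user id and strings are illustrative; it assumes the embedding files under /app/model_cache and the NVIDIA_URI / FlashAPI environment variables exist, since memory.py and models/summarizer.py initialise their clients at import time.

# Illustrative only — exercises the MemoryManager defined above.
from memory import MemoryManager

memory = MemoryManager(max_users=1000, history_per_user=20, max_chunks=60)
user_id = "demo-user"  # hypothetical id

# Store one exchange; the response is chunked and upserted into STM and LTM.
memory.add_exchange(
    user_id,
    query="How do I keep roast chicken moist?",
    response="Brine the bird, roast at 200C, and rest it for 10 minutes before carving.",
    lang="EN",
)

# On a later turn, pull semantically related chunks and a summarised context block.
chunks = memory.get_relevant_chunks(user_id, "resting time for roast chicken", top_k=3)
context = memory.get_context(user_id, num_turns=5)
print(chunks, context, sep="\n")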
models/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
# Models package
from .llama import NVIDIALLamaClient, process_search_query
from .summarizer import TextSummarizer, summarizer
|
models/download_model.py
ADDED
|
@@ -0,0 +1,51 @@
# download_model.py
### --- A. transformer and embedder ---
import os
import shutil
from huggingface_hub import snapshot_download

# Set up paths
MODEL_REPO = "sentence-transformers/all-MiniLM-L6-v2"
MODEL_CACHE_DIR = "/app/model_cache"

print("⏳ Downloading the SentenceTransformer model...")
model_path = snapshot_download(repo_id=MODEL_REPO, cache_dir=MODEL_CACHE_DIR)

print("Model path: ", model_path)

# Ensure the directory exists
if not os.path.exists(MODEL_CACHE_DIR):
    os.makedirs(MODEL_CACHE_DIR)

# Move all contents from the snapshot folder
if os.path.exists(model_path):
    print(f"📂 Moving model files from {model_path} to {MODEL_CACHE_DIR}...")

    for item in os.listdir(model_path):
        source = os.path.join(model_path, item)
        destination = os.path.join(MODEL_CACHE_DIR, item)

        if os.path.isdir(source):
            shutil.copytree(source, destination, dirs_exist_ok=True)
        else:
            shutil.copy2(source, destination)

    print(f"✅ Model extracted and flattened in {MODEL_CACHE_DIR}")
else:
    print("❌ No snapshot directory found!")
    exit(1)

# Verify structure after moving
print("\n📂 LLM Model Structure (Build Level):")
for root, dirs, files in os.walk(MODEL_CACHE_DIR):
    print(f"📁 {root}/")
    for file in files:
        print(f" 📄 {file}")


### --- B. translation modules ---
from transformers import pipeline
print("⏬ Downloading Vietnamese–English translator...")
_ = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en")
print("⏬ Downloading Chinese–English translator...")
_ = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en")
|
models/llama.py
ADDED
|
@@ -0,0 +1,125 @@
| 1 |
+
import os
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import logging
|
| 5 |
+
import time
|
| 6 |
+
from typing import List, Dict, Tuple
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
class NVIDIALLamaClient:
|
| 11 |
+
def __init__(self):
|
| 12 |
+
self.api_key = os.getenv("NVIDIA_URI")
|
| 13 |
+
if not self.api_key:
|
| 14 |
+
raise ValueError("NVIDIA_URI environment variable not set")
|
| 15 |
+
|
| 16 |
+
# Correct NVIDIA Integrate API base
|
| 17 |
+
self.base_url = "https://integrate.api.nvidia.com/v1"
|
| 18 |
+
self.model = "meta/llama-3.1-8b-instruct"
|
| 19 |
+
|
| 20 |
+
def generate_keywords(self, user_query: str) -> List[str]:
|
| 21 |
+
"""Use Llama to generate search keywords from user query"""
|
| 22 |
+
try:
|
| 23 |
+
prompt = f"""Given this medical question: "{user_query}"
|
| 24 |
+
|
| 25 |
+
Generate 3-5 specific search keywords that would help find relevant medical information online.
|
| 26 |
+
Focus on medical terms, symptoms, conditions, treatments, or procedures mentioned.
|
| 27 |
+
Return only the keywords separated by commas, no explanations.
|
| 28 |
+
|
| 29 |
+
Keywords:"""
|
| 30 |
+
|
| 31 |
+
response = self._call_llama(prompt)
|
| 32 |
+
|
| 33 |
+
# Extract keywords from response
|
| 34 |
+
keywords = [kw.strip() for kw in response.split(',') if kw.strip()]
|
| 35 |
+
logger.info(f"Generated keywords: {keywords}")
|
| 36 |
+
return keywords[:5] # Limit to 5 keywords
|
| 37 |
+
|
| 38 |
+
except Exception as e:
|
| 39 |
+
logger.error(f"Failed to generate keywords: {e}")
|
| 40 |
+
return [user_query] # Fallback to original query
|
| 41 |
+
|
| 42 |
+
def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
|
| 43 |
+
"""Use Llama to summarize documents and return summary with URL mapping"""
|
| 44 |
+
try:
|
| 45 |
+
# Import summarizer here to avoid circular imports
|
| 46 |
+
from .summarizer import summarizer
|
| 47 |
+
|
| 48 |
+
# Use the summarizer for document summarization
|
| 49 |
+
combined_summary, url_mapping = summarizer.summarize_documents(documents, user_query)
|
| 50 |
+
|
| 51 |
+
return combined_summary, url_mapping
|
| 52 |
+
|
| 53 |
+
except Exception as e:
|
| 54 |
+
logger.error(f"Failed to summarize documents: {e}")
|
| 55 |
+
return "", {}
|
| 56 |
+
|
| 57 |
+
def _call_llama(self, prompt: str, max_retries: int = 3) -> str:
|
| 58 |
+
"""Make API call to NVIDIA Llama model with retry logic"""
|
| 59 |
+
for attempt in range(max_retries):
|
| 60 |
+
try:
|
| 61 |
+
headers = {
|
| 62 |
+
"Authorization": f"Bearer {self.api_key}",
|
| 63 |
+
"Content-Type": "application/json"
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
payload = {
|
| 67 |
+
"model": self.model,
|
| 68 |
+
"messages": [
|
| 69 |
+
{
|
| 70 |
+
"role": "user",
|
| 71 |
+
"content": prompt
|
| 72 |
+
}
|
| 73 |
+
],
|
| 74 |
+
"temperature": 0.7,
|
| 75 |
+
"max_tokens": 1000
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
response = requests.post(
|
| 79 |
+
f"{self.base_url}/chat/completions",
|
| 80 |
+
headers=headers,
|
| 81 |
+
json=payload,
|
| 82 |
+
timeout=30
|
| 83 |
+
)
|
| 84 |
+
|
| 85 |
+
response.raise_for_status()
|
| 86 |
+
result = response.json()
|
| 87 |
+
|
| 88 |
+
content = result['choices'][0]['message']['content'].strip()
|
| 89 |
+
if not content:
|
| 90 |
+
raise ValueError("Empty response from Llama API")
|
| 91 |
+
|
| 92 |
+
return content
|
| 93 |
+
|
| 94 |
+
except requests.exceptions.Timeout:
|
| 95 |
+
logger.warning(f"Llama API timeout (attempt {attempt + 1}/{max_retries})")
|
| 96 |
+
if attempt == max_retries - 1:
|
| 97 |
+
raise
|
| 98 |
+
time.sleep(2 ** attempt) # Exponential backoff
|
| 99 |
+
|
| 100 |
+
except requests.exceptions.RequestException as e:
|
| 101 |
+
logger.warning(f"Llama API request failed (attempt {attempt + 1}/{max_retries}): {e}")
|
| 102 |
+
if attempt == max_retries - 1:
|
| 103 |
+
raise
|
| 104 |
+
time.sleep(2 ** attempt)
|
| 105 |
+
|
| 106 |
+
except Exception as e:
|
| 107 |
+
logger.error(f"Llama API call failed: {e}")
|
| 108 |
+
raise
|
| 109 |
+
|
| 110 |
+
def process_search_query(user_query: str, search_results: List[Dict]) -> Tuple[str, Dict[int, str]]:
|
| 111 |
+
"""Process search results using Llama model"""
|
| 112 |
+
try:
|
| 113 |
+
llama_client = NVIDIALLamaClient()
|
| 114 |
+
|
| 115 |
+
# Generate search keywords
|
| 116 |
+
keywords = llama_client.generate_keywords(user_query)
|
| 117 |
+
|
| 118 |
+
# Summarize documents
|
| 119 |
+
summary, url_mapping = llama_client.summarize_documents(search_results, user_query)
|
| 120 |
+
|
| 121 |
+
return summary, url_mapping
|
| 122 |
+
|
| 123 |
+
except Exception as e:
|
| 124 |
+
logger.error(f"Failed to process search query: {e}")
|
| 125 |
+
return "", {}
|
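A short, hedged usage sketch for the client above; the query and document fields are illustrative, and a valid NVIDIA_URI key must be set for the calls to succeed.

# Illustrative only — requires NVIDIA_URI in the environment.
from models.llama import NVIDIALLamaClient, process_search_query

client = NVIDIALLamaClient()

# Turn a user question into search keywords (falls back to the raw query on error).
print(client.generate_keywords("substitute for buttermilk in pancakes"))

# Summarise pre-fetched documents; the dict fields match what summarize_documents expects.
docs = [{
    "id": 1,
    "url": "https://example.com/buttermilk-substitutes",  # hypothetical URL
    "title": "Buttermilk substitutes",
    "content": "Mix one cup of milk with a tablespoon of lemon juice and let it sit for ten minutes...",
}]
summary, url_mapping = process_search_query("substitute for buttermilk", docs)
print(summary, url_mapping)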
models/summarizer.py
ADDED
|
@@ -0,0 +1,216 @@
| 1 |
+
import re
|
| 2 |
+
import logging
|
| 3 |
+
from typing import List, Dict, Tuple
|
| 4 |
+
from .llama import NVIDIALLamaClient
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
class TextSummarizer:
|
| 9 |
+
def __init__(self):
|
| 10 |
+
self.llama_client = NVIDIALLamaClient()
|
| 11 |
+
|
| 12 |
+
def clean_text(self, text: str) -> str:
|
| 13 |
+
"""Clean and normalize text for summarization"""
|
| 14 |
+
if not text:
|
| 15 |
+
return ""
|
| 16 |
+
|
| 17 |
+
# Remove common conversation starters and fillers
|
| 18 |
+
conversation_patterns = [
|
| 19 |
+
r'\b(hi|hello|hey|sure|okay|yes|no|thanks|thank you)\b',
|
| 20 |
+
r'\b(here is|this is|let me|i will|i can|i would)\b',
|
| 21 |
+
r'\b(summarize|summary|here\'s|here is)\b',
|
| 22 |
+
r'\b(please|kindly|would you|could you)\b',
|
| 23 |
+
r'\b(um|uh|er|ah|well|so|like|you know)\b'
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
# Remove excessive whitespace and normalize
|
| 27 |
+
text = re.sub(r'\s+', ' ', text)
|
| 28 |
+
text = re.sub(r'\n+', ' ', text)
|
| 29 |
+
|
| 30 |
+
# Remove conversation patterns
|
| 31 |
+
for pattern in conversation_patterns:
|
| 32 |
+
text = re.sub(pattern, '', text, flags=re.IGNORECASE)
|
| 33 |
+
|
| 34 |
+
# Remove extra punctuation and normalize
|
| 35 |
+
text = re.sub(r'[.]{2,}', '.', text)
|
| 36 |
+
text = re.sub(r'[!]{2,}', '!', text)
|
| 37 |
+
text = re.sub(r'[?]{2,}', '?', text)
|
| 38 |
+
|
| 39 |
+
return text.strip()
|
| 40 |
+
|
| 41 |
+
def extract_key_phrases(self, text: str) -> List[str]:
|
| 42 |
+
"""Extract key medical phrases and terms"""
|
| 43 |
+
if not text:
|
| 44 |
+
return []
|
| 45 |
+
|
| 46 |
+
# Medical term patterns
|
| 47 |
+
medical_patterns = [
|
| 48 |
+
r'\b(?:symptoms?|diagnosis|treatment|therapy|medication|drug|disease|condition|syndrome)\b',
|
| 49 |
+
r'\b(?:patient|doctor|physician|medical|clinical|healthcare)\b',
|
| 50 |
+
r'\b(?:blood pressure|heart rate|temperature|pulse|respiration)\b',
|
| 51 |
+
r'\b(?:acute|chronic|severe|mild|moderate|serious|critical)\b',
|
| 52 |
+
r'\b(?:pain|ache|discomfort|swelling|inflammation|infection)\b'
|
| 53 |
+
]
|
| 54 |
+
|
| 55 |
+
key_phrases = []
|
| 56 |
+
for pattern in medical_patterns:
|
| 57 |
+
matches = re.findall(pattern, text, re.IGNORECASE)
|
| 58 |
+
key_phrases.extend(matches)
|
| 59 |
+
|
| 60 |
+
return list(set(key_phrases)) # Remove duplicates
|
| 61 |
+
|
| 62 |
+
def summarize_text(self, text: str, max_length: int = 200) -> str:
|
| 63 |
+
"""Summarize text using NVIDIA Llama model"""
|
| 64 |
+
try:
|
| 65 |
+
if not text or len(text.strip()) < 50:
|
| 66 |
+
return text
|
| 67 |
+
|
| 68 |
+
# Clean the text first
|
| 69 |
+
cleaned_text = self.clean_text(text)
|
| 70 |
+
|
| 71 |
+
# Extract key phrases for context
|
| 72 |
+
key_phrases = self.extract_key_phrases(cleaned_text)
|
| 73 |
+
key_phrases_str = ", ".join(key_phrases[:5]) if key_phrases else "medical information"
|
| 74 |
+
|
| 75 |
+
# Create optimized prompt
|
| 76 |
+
prompt = f"""Summarize this medical text in {max_length} characters or less. Focus only on key medical facts, symptoms, treatments, and diagnoses. Do not include greetings, confirmations, or conversational elements.
|
| 77 |
+
|
| 78 |
+
Key terms: {key_phrases_str}
|
| 79 |
+
|
| 80 |
+
Text: {cleaned_text[:1500]}
|
| 81 |
+
|
| 82 |
+
Summary:"""
|
| 83 |
+
|
| 84 |
+
summary = self.llama_client._call_llama(prompt)
|
| 85 |
+
|
| 86 |
+
# Post-process summary
|
| 87 |
+
summary = self.clean_text(summary)
|
| 88 |
+
|
| 89 |
+
# Ensure it's within length limit
|
| 90 |
+
if len(summary) > max_length:
|
| 91 |
+
summary = summary[:max_length-3] + "..."
|
| 92 |
+
|
| 93 |
+
return summary
|
| 94 |
+
|
| 95 |
+
except Exception as e:
|
| 96 |
+
logger.error(f"Summarization failed: {e}")
|
| 97 |
+
# Fallback to simple truncation
|
| 98 |
+
return self.clean_text(text)[:max_length]
|
| 99 |
+
|
| 100 |
+
def summarize_for_query(self, text: str, query: str, max_length: int = 220) -> str:
|
| 101 |
+
"""Summarize text focusing strictly on information relevant to the query.
|
| 102 |
+
Returns an empty string if nothing relevant is found.
|
| 103 |
+
"""
|
| 104 |
+
try:
|
| 105 |
+
if not text:
|
| 106 |
+
return ""
|
| 107 |
+
cleaned_text = self.clean_text(text)
|
| 108 |
+
if not cleaned_text:
|
| 109 |
+
return ""
|
| 110 |
+
|
| 111 |
+
# Short, strict prompt to avoid verbosity; instruct to output NOTHING if irrelevant
|
| 112 |
+
prompt = (
|
| 113 |
+
f"You extract only medically relevant facts that help answer: '{query}'. "
|
| 114 |
+
f"Respond with a concise bullet list (<= {max_length} chars total). "
|
| 115 |
+
"If the content is irrelevant, respond with EXACTLY: NONE.\n\n"
|
| 116 |
+
f"Content: {cleaned_text[:1600]}\n\nRelevant facts:"
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
summary = self.llama_client._call_llama(prompt)
|
| 120 |
+
summary = self.clean_text(summary)
|
| 121 |
+
if not summary or summary.upper().strip() == "NONE":
|
| 122 |
+
return ""
|
| 123 |
+
if len(summary) > max_length:
|
| 124 |
+
summary = summary[:max_length-3] + "..."
|
| 125 |
+
return summary
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logger.warning(f"Query-focused summarization failed: {e}")
|
| 128 |
+
return ""
|
| 129 |
+
|
| 130 |
+
def summarize_documents(self, documents: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
|
| 131 |
+
"""Summarize multiple documents with URL mapping"""
|
| 132 |
+
try:
|
| 133 |
+
doc_summaries = []
|
| 134 |
+
url_mapping = {}
|
| 135 |
+
|
| 136 |
+
for doc in documents:
|
| 137 |
+
doc_id = doc['id']
|
| 138 |
+
url_mapping[doc_id] = doc['url']
|
| 139 |
+
|
| 140 |
+
# Create focused summary for each document
|
| 141 |
+
summary_prompt = f"""Summarize this medical document in 2-3 sentences, focusing on information relevant to: "{user_query}"
|
| 142 |
+
|
| 143 |
+
Document: {doc['title']}
|
| 144 |
+
Content: {doc['content'][:800]}
|
| 145 |
+
|
| 146 |
+
Key medical information:"""
|
| 147 |
+
|
| 148 |
+
summary = self.llama_client._call_llama(summary_prompt)
|
| 149 |
+
summary = self.clean_text(summary)
|
| 150 |
+
|
| 151 |
+
doc_summaries.append(f"Document {doc_id}: {summary}")
|
| 152 |
+
|
| 153 |
+
combined_summary = "\n\n".join(doc_summaries)
|
| 154 |
+
return combined_summary, url_mapping
|
| 155 |
+
|
| 156 |
+
except Exception as e:
|
| 157 |
+
logger.error(f"Document summarization failed: {e}")
|
| 158 |
+
return "", {}
|
| 159 |
+
|
| 160 |
+
def summarize_conversation_chunk(self, chunk: str) -> str:
|
| 161 |
+
"""Summarize a conversation chunk for memory"""
|
| 162 |
+
try:
|
| 163 |
+
if not chunk or len(chunk.strip()) < 30:
|
| 164 |
+
return chunk
|
| 165 |
+
|
| 166 |
+
cleaned_chunk = self.clean_text(chunk)
|
| 167 |
+
|
| 168 |
+
prompt = f"""Summarize this medical conversation in 1-2 sentences. Focus only on medical facts, symptoms, treatments, or diagnoses discussed. Remove greetings and conversational elements.
|
| 169 |
+
|
| 170 |
+
Conversation: {cleaned_chunk[:1000]}
|
| 171 |
+
|
| 172 |
+
Medical summary:"""
|
| 173 |
+
|
| 174 |
+
summary = self.llama_client._call_llama(prompt)
|
| 175 |
+
return self.clean_text(summary)
|
| 176 |
+
|
| 177 |
+
except Exception as e:
|
| 178 |
+
logger.error(f"Conversation summarization failed: {e}")
|
| 179 |
+
return self.clean_text(chunk)[:150]
|
| 180 |
+
|
| 181 |
+
def chunk_response(self, response: str, max_chunk_size: int = 500) -> List[str]:
|
| 182 |
+
"""Split response into chunks and summarize each"""
|
| 183 |
+
try:
|
| 184 |
+
if not response or len(response) <= max_chunk_size:
|
| 185 |
+
return [response]
|
| 186 |
+
|
| 187 |
+
# Split by sentences first
|
| 188 |
+
sentences = re.split(r'[.!?]+', response)
|
| 189 |
+
chunks = []
|
| 190 |
+
current_chunk = ""
|
| 191 |
+
|
| 192 |
+
for sentence in sentences:
|
| 193 |
+
sentence = sentence.strip()
|
| 194 |
+
if not sentence:
|
| 195 |
+
continue
|
| 196 |
+
|
| 197 |
+
# Check if adding this sentence would exceed limit
|
| 198 |
+
if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
|
| 199 |
+
chunks.append(self.summarize_conversation_chunk(current_chunk))
|
| 200 |
+
current_chunk = sentence
|
| 201 |
+
else:
|
| 202 |
+
current_chunk += sentence + ". "
|
| 203 |
+
|
| 204 |
+
# Add the last chunk
|
| 205 |
+
if current_chunk:
|
| 206 |
+
chunks.append(self.summarize_conversation_chunk(current_chunk))
|
| 207 |
+
|
| 208 |
+
return chunks
|
| 209 |
+
|
| 210 |
+
except Exception as e:
|
| 211 |
+
logger.error(f"Response chunking failed: {e}")
|
| 212 |
+
return [response]
|
| 213 |
+
|
| 214 |
+
# Global summarizer instance
|
| 215 |
+
summarizer = TextSummarizer()
|
| 216 |
+
|
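A minimal sketch of the module-level summarizer instance in use; the reply text is illustrative, and the calls require NVIDIA_URI since TextSummarizer wraps NVIDIALLamaClient.

# Illustrative only — uses the global instance created above.
from models.summarizer import summarizer

long_reply = (
    "Hi! Sure, here is how to make a pan sauce. Deglaze the pan with stock, "
    "reduce by half, whisk in cold butter off the heat, then season to taste."
)

# General-purpose summary capped at 200 characters.
print(summarizer.summarize_text(long_reply, max_length=200))

# Query-focused variant: returns "" when the content is irrelevant to the query.
print(summarizer.summarize_for_query(long_reply, "how to finish a pan sauce"))

# Sentence-based chunking, as used by MemoryManager.chunk_response.
print(summarizer.chunk_response(long_reply, max_chunk_size=120))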
models/warmup.py
ADDED
|
@@ -0,0 +1,8 @@
from sentence_transformers import SentenceTransformer
import torch

print("🚀 Warming up model...")
embedding_model = SentenceTransformer("/app/model_cache", device="cpu")
# embedding_model = embedding_model.half() # Reduce memory
embedding_model.to(torch.device("cpu"))
print("✅ Model warm-up complete!")
|
requirements.txt
ADDED
|
@@ -0,0 +1,25 @@
# requirements.txt - Cooking Tutor API
# **LLMs**
google-genai
huggingface_hub
# **Memory & Embeddings**
faiss-cpu
sentence-transformers
# **Translation**
transformers
accelerate
sentencepiece
# **Environment**
python-dotenv
# **Deployment**
uvicorn
fastapi
torch  # For translation models
psutil  # System monitoring
# **Web Search**
requests
beautifulsoup4
langdetect
# **Data Processing**
pandas
numpy
|
search/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
search/__init__.py
ADDED
|
@@ -0,0 +1,26 @@
# Search package
from .search import WebSearcher, search_web, search_web_with_content, search_medical, search_multilingual_medical, search_videos, search_comprehensive
from .coordinator import SearchCoordinator
from .engines import DuckDuckGoEngine, MedicalSearchEngine, MultilingualMedicalEngine, VideoSearchEngine
from .extractors import ContentExtractor
from .processors import MedicalSearchProcessor, LanguageProcessor, SourceAggregator, EnhancedContentProcessor

__all__ = [
    'WebSearcher',
    'search_web',
    'search_web_with_content',
    'search_medical',
    'search_multilingual_medical',
    'search_videos',
    'search_comprehensive',
    'SearchCoordinator',
    'DuckDuckGoEngine',
    'MedicalSearchEngine',
    'MultilingualMedicalEngine',
    'VideoSearchEngine',
    'ContentExtractor',
    'MedicalSearchProcessor',
    'LanguageProcessor',
    'SourceAggregator',
    'EnhancedContentProcessor'
]
|
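A hedged usage sketch for the package exports above, exercising the SearchCoordinator added in search/coordinator.py below. Network access is required and results depend on the live engines; the query is illustrative.

# Illustrative only — assumes the package imports above resolve in this environment.
from search import SearchCoordinator

coordinator = SearchCoordinator(max_workers=3)

# Fast path: DuckDuckGo only, no content extraction.
for hit in coordinator.quick_search("how to proof pizza dough overnight", num_results=5):
    print(hit.get("title"), hit.get("url"))

# Full path: multilingual search plus content extraction and summarisation.
summary, url_mapping = coordinator.search("how to proof pizza dough overnight", num_results=6)
print(summary[:300], url_mapping)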
search/coordinator.py
ADDED
|
@@ -0,0 +1,504 @@
| 1 |
+
import logging
|
| 2 |
+
from typing import List, Dict, Tuple
|
| 3 |
+
import time
|
| 4 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 5 |
+
|
| 6 |
+
from .engines.duckduckgo import DuckDuckGoEngine
|
| 7 |
+
from .engines.cooking import CookingSearchEngine
|
| 8 |
+
from .engines.multilingual import MultilingualCookingEngine
|
| 9 |
+
from .engines.video import VideoSearchEngine
|
| 10 |
+
from .extractors.content import ContentExtractor
|
| 11 |
+
from .processors.cooking import CookingSearchProcessor
|
| 12 |
+
from .processors.language import LanguageProcessor
|
| 13 |
+
from .processors.sources import SourceAggregator
|
| 14 |
+
from .processors.enhanced import EnhancedContentProcessor
|
| 15 |
+
# Reranker removed - using simple relevance scoring for cooking content
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
class SearchCoordinator:
|
| 20 |
+
"""Coordinate multiple search strategies for comprehensive cooking information"""
|
| 21 |
+
|
| 22 |
+
def __init__(self, max_workers: int = 3):
|
| 23 |
+
self.max_workers = max_workers
|
| 24 |
+
|
| 25 |
+
# Initialize search engines
|
| 26 |
+
self.duckduckgo_engine = DuckDuckGoEngine()
|
| 27 |
+
self.cooking_engine = CookingSearchEngine()
|
| 28 |
+
self.multilingual_engine = MultilingualCookingEngine()
|
| 29 |
+
self.video_engine = VideoSearchEngine()
|
| 30 |
+
|
| 31 |
+
# Initialize processors
|
| 32 |
+
self.content_extractor = ContentExtractor()
|
| 33 |
+
self.cooking_processor = CookingSearchProcessor()
|
| 34 |
+
self.language_processor = LanguageProcessor()
|
| 35 |
+
self.source_aggregator = SourceAggregator()
|
| 36 |
+
self.enhanced_processor = EnhancedContentProcessor()
|
| 37 |
+
self.reranker = None # No complex reranking needed for cooking content
|
| 38 |
+
|
| 39 |
+
# Search strategies
|
| 40 |
+
self.strategies = [
|
| 41 |
+
self._search_multilingual,
|
| 42 |
+
self._search_duckduckgo,
|
| 43 |
+
self._search_cooking_sources
|
| 44 |
+
]
|
| 45 |
+
|
| 46 |
+
def search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
|
| 47 |
+
"""Execute comprehensive multilingual search with multiple strategies"""
|
| 48 |
+
logger.info(f"Starting comprehensive multilingual search for: {query}")
|
| 49 |
+
|
| 50 |
+
# Detect and enhance query for multiple languages
|
| 51 |
+
enhanced_queries = self.language_processor.enhance_query(query, target_language)
|
| 52 |
+
logger.info(f"Enhanced queries: {list(enhanced_queries.keys())}")
|
| 53 |
+
|
| 54 |
+
# Execute search strategies in parallel
|
| 55 |
+
all_results = []
|
| 56 |
+
|
| 57 |
+
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
| 58 |
+
# Submit search tasks for each language
|
| 59 |
+
future_to_strategy = {}
|
| 60 |
+
|
| 61 |
+
for lang, enhanced_query in enhanced_queries.items():
|
| 62 |
+
for strategy in self.strategies:
|
| 63 |
+
future = executor.submit(strategy, enhanced_query, num_results // len(enhanced_queries), lang)
|
| 64 |
+
future_to_strategy[future] = f"{strategy.__name__}_{lang}"
|
| 65 |
+
|
| 66 |
+
# Collect results
|
| 67 |
+
for future in as_completed(future_to_strategy):
|
| 68 |
+
strategy_name = future_to_strategy[future]
|
| 69 |
+
try:
|
| 70 |
+
results = future.result()
|
| 71 |
+
if results:
|
| 72 |
+
all_results.extend(results)
|
| 73 |
+
logger.info(f"{strategy_name} found {len(results)} results")
|
| 74 |
+
except Exception as e:
|
| 75 |
+
logger.error(f"{strategy_name} failed: {e}")
|
| 76 |
+
|
| 77 |
+
# Remove duplicates and filter by language preference
|
| 78 |
+
unique_results = self._remove_duplicates(all_results)
|
| 79 |
+
if target_language:
|
| 80 |
+
unique_results = self.language_processor.filter_by_language(unique_results, target_language)
|
| 81 |
+
|
| 82 |
+
logger.info(f"Total unique results: {len(unique_results)}")
|
| 83 |
+
|
| 84 |
+
# Extract content from URLs
|
| 85 |
+
enriched_results = self._enrich_with_content(unique_results)
|
| 86 |
+
|
| 87 |
+
# Simple cooking relevance filtering
|
| 88 |
+
if enriched_results:
|
| 89 |
+
cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
|
| 90 |
+
relevant_results = []
|
| 91 |
+
for result in enriched_results:
|
| 92 |
+
title = result.get('title', '').lower()
|
| 93 |
+
content = result.get('content', '').lower()
|
| 94 |
+
if any(keyword in title or keyword in content for keyword in cooking_keywords):
|
| 95 |
+
relevant_results.append(result)
|
| 96 |
+
|
| 97 |
+
if relevant_results:
|
| 98 |
+
enriched_results = relevant_results
|
| 99 |
+
logger.info(f"Filtered to {len(enriched_results)} cooking-relevant results")
|
| 100 |
+
|
| 101 |
+
# Process results into comprehensive summary
|
| 102 |
+
summary, url_mapping = self.cooking_processor.process_results(enriched_results, query)
|
| 103 |
+
|
| 104 |
+
logger.info(f"Multilingual search completed: {len(url_mapping)} sources processed")
|
| 105 |
+
return summary, url_mapping
|
| 106 |
+
|
| 107 |
+
def _search_multilingual(self, query: str, num_results: int, language: str = None) -> List[Dict]:
|
| 108 |
+
"""Search using multilingual medical engine"""
|
| 109 |
+
try:
|
| 110 |
+
if language:
|
| 111 |
+
results = self.multilingual_engine.search_by_language(query, language, num_results)
|
| 112 |
+
else:
|
| 113 |
+
results = self.multilingual_engine.search(query, num_results)
|
| 114 |
+
return results
|
| 115 |
+
except Exception as e:
|
| 116 |
+
logger.error(f"Multilingual search failed: {e}")
|
| 117 |
+
return []
|
| 118 |
+
|
| 119 |
+
def _search_duckduckgo(self, query: str, num_results: int, language: str = None) -> List[Dict]:
|
| 120 |
+
"""Search using DuckDuckGo engine"""
|
| 121 |
+
try:
|
| 122 |
+
results = self.duckduckgo_engine.search(query, num_results)
|
| 123 |
+
return results
|
| 124 |
+
except Exception as e:
|
| 125 |
+
logger.error(f"DuckDuckGo search failed: {e}")
|
| 126 |
+
return []
|
| 127 |
+
|
| 128 |
+
def _search_cooking_sources(self, query: str, num_results: int, language: str = None) -> List[Dict]:
|
| 129 |
+
"""Search using cooking sources engine"""
|
| 130 |
+
try:
|
| 131 |
+
results = self.cooking_engine.search(query, num_results)
|
| 132 |
+
return results
|
| 133 |
+
except Exception as e:
|
| 134 |
+
logger.error(f"Cooking sources search failed: {e}")
|
| 135 |
+
return []
|
| 136 |
+
|
| 137 |
+
def _remove_duplicates(self, results: List[Dict]) -> List[Dict]:
|
| 138 |
+
"""Remove duplicate results based on URL"""
|
| 139 |
+
seen_urls = set()
|
| 140 |
+
unique_results = []
|
| 141 |
+
|
| 142 |
+
for result in results:
|
| 143 |
+
url = result.get('url', '')
|
| 144 |
+
if url and url not in seen_urls:
|
| 145 |
+
seen_urls.add(url)
|
| 146 |
+
unique_results.append(result)
|
| 147 |
+
|
| 148 |
+
return unique_results
|
| 149 |
+
|
| 150 |
+
def _enrich_with_content(self, results: List[Dict]) -> List[Dict]:
|
| 151 |
+
"""Enrich results with extracted content"""
|
| 152 |
+
enriched_results = []
|
| 153 |
+
|
| 154 |
+
# Extract content in parallel
|
| 155 |
+
with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
|
| 156 |
+
# Submit content extraction tasks
|
| 157 |
+
future_to_result = {
|
| 158 |
+
executor.submit(self.content_extractor.extract, result['url']): result
|
| 159 |
+
for result in results
|
| 160 |
+
}
|
| 161 |
+
|
| 162 |
+
# Collect enriched results
|
| 163 |
+
for future in as_completed(future_to_result):
|
| 164 |
+
original_result = future_to_result[future]
|
| 165 |
+
try:
|
| 166 |
+
content = future.result()
|
| 167 |
+
if content:
|
| 168 |
+
enriched_result = original_result.copy()
|
| 169 |
+
enriched_result['content'] = content
|
| 170 |
+
enriched_results.append(enriched_result)
|
| 171 |
+
except Exception as e:
|
| 172 |
+
logger.warning(f"Content extraction failed for {original_result['url']}: {e}")
|
| 173 |
+
# Still include result without content
|
| 174 |
+
enriched_results.append(original_result)
|
| 175 |
+
|
| 176 |
+
return enriched_results
|
| 177 |
+
|
| 178 |
+
def quick_search(self, query: str, num_results: int = 5) -> List[Dict]:
|
| 179 |
+
"""Quick search for basic results without content extraction"""
|
| 180 |
+
logger.info(f"Quick search for: {query}")
|
| 181 |
+
|
| 182 |
+
# Use only DuckDuckGo for speed
|
| 183 |
+
results = self.duckduckgo_engine.search(query, num_results)
|
| 184 |
+
|
| 185 |
+
# If no results, try with simplified query
|
| 186 |
+
if not results:
|
| 187 |
+
logger.warning("No results from DuckDuckGo, trying simplified query")
|
| 188 |
+
simplified_query = self._simplify_query(query)
|
| 189 |
+
if simplified_query != query:
|
| 190 |
+
results = self.duckduckgo_engine.search(simplified_query, num_results)
|
| 191 |
+
logger.info(f"Simplified query '{simplified_query}' found {len(results)} results")
|
| 192 |
+
|
| 193 |
+
# If still no results, try cooking engine as fallback
|
| 194 |
+
if not results:
|
| 195 |
+
logger.warning("Still no results, trying cooking engine fallback")
|
| 196 |
+
try:
|
| 197 |
+
cooking_results = self.cooking_engine.search(query, num_results)
|
| 198 |
+
if cooking_results:
|
| 199 |
+
results = cooking_results
|
| 200 |
+
logger.info(f"Cooking engine fallback found {len(results)} results")
|
| 201 |
+
except Exception as e:
|
| 202 |
+
logger.warning(f"Cooking engine fallback failed: {e}")
|
| 203 |
+
|
| 204 |
+
# Remove duplicates
|
| 205 |
+
unique_results = self._remove_duplicates(results)
|
| 206 |
+
|
| 207 |
+
# If we still have no results, create a basic fallback
|
| 208 |
+
if not unique_results:
|
| 209 |
+
logger.warning("No search results found, creating basic fallback")
|
| 210 |
+
unique_results = self._create_fallback_results(query)
|
| 211 |
+
|
| 212 |
+
logger.info(f"Quick search completed: {len(unique_results)} results")
|
| 213 |
+
return unique_results
|
| 214 |
+
|
| 215 |
+
def _simplify_query(self, query: str) -> str:
|
| 216 |
+
"""Simplify query to core cooking terms"""
|
| 217 |
+
if not query:
|
| 218 |
+
return ""
|
| 219 |
+
|
| 220 |
+
# Extract key cooking terms
|
| 221 |
+
import re
|
| 222 |
+
words = query.split()
|
| 223 |
+
|
| 224 |
+
# Keep cooking keywords and important terms
|
| 225 |
+
cooking_keywords = [
|
| 226 |
+
'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
|
| 227 |
+
'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
|
| 228 |
+
'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
|
| 229 |
+
'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
|
| 230 |
+
'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
|
| 231 |
+
'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
|
| 232 |
+
'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai'
|
| 233 |
+
]
|
| 234 |
+
|
| 235 |
+
# Keep words that are cooking keywords or are important (longer than 3 chars)
|
| 236 |
+
important_words = []
|
| 237 |
+
for word in words:
|
| 238 |
+
word_lower = word.lower()
|
| 239 |
+
if word_lower in cooking_keywords or len(word) > 3:
|
| 240 |
+
important_words.append(word)
|
| 241 |
+
|
| 242 |
+
# If we have important words, use them; otherwise use first few words
|
| 243 |
+
if important_words:
|
| 244 |
+
return ' '.join(important_words[:5]) # Max 5 words
|
| 245 |
+
else:
|
| 246 |
+
return ' '.join(words[:3]) # Max 3 words
|
| 247 |
+
|
| 248 |
+
def _create_fallback_results(self, query: str) -> List[Dict]:
|
| 249 |
+
"""Create basic fallback results when search fails"""
|
| 250 |
+
# Create some basic cooking information URLs as fallback
|
| 251 |
+
fallback_urls = [
|
| 252 |
+
"https://www.allrecipes.com",
|
| 253 |
+
"https://www.foodnetwork.com",
|
| 254 |
+
"https://www.epicurious.com",
|
| 255 |
+
"https://www.seriouseats.com",
|
| 256 |
+
"https://www.bonappetit.com"
|
| 257 |
+
]
|
| 258 |
+
|
| 259 |
+
results = []
|
| 260 |
+
for i, url in enumerate(fallback_urls[:3]): # Limit to 3 fallback results
|
| 261 |
+
results.append({
|
| 262 |
+
'url': url,
|
| 263 |
+
'title': f"Cooking Information - {query}",
|
| 264 |
+
'source': 'fallback',
|
| 265 |
+
'composite_score': 0.3 - (i * 0.05) # Decreasing score
|
| 266 |
+
})
|
| 267 |
+
|
| 268 |
+
return results
|
| 269 |
+
|
| 270 |
+
def cooking_focus_search(self, query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
|
| 271 |
+
"""Cooking-focused search with enhanced processing"""
|
| 272 |
+
logger.info(f"Cooking focus search for: {query}")
|
| 273 |
+
|
| 274 |
+
# Use cooking engine primarily
|
| 275 |
+
cooking_results = self.cooking_engine.search(query, num_results)
|
| 276 |
+
|
| 277 |
+
# Add some general results for context
|
| 278 |
+
general_results = self.duckduckgo_engine.search(query, 3)
|
| 279 |
+
|
| 280 |
+
# Combine and deduplicate
|
| 281 |
+
all_results = self._remove_duplicates(cooking_results + general_results)
|
| 282 |
+
|
| 283 |
+
# Enrich with content
|
| 284 |
+
enriched_results = self._enrich_with_content(all_results)
|
| 285 |
+
|
| 286 |
+
# Simple cooking relevance filtering
|
| 287 |
+
if enriched_results:
|
| 288 |
+
cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
|
| 289 |
+
relevant_results = []
|
| 290 |
+
for result in enriched_results:
|
| 291 |
+
title = result.get('title', '').lower()
|
| 292 |
+
content = result.get('content', '').lower()
|
| 293 |
+
if any(keyword in title or keyword in content for keyword in cooking_keywords):
|
| 294 |
+
relevant_results.append(result)
|
| 295 |
+
|
| 296 |
+
if relevant_results:
|
| 297 |
+
enriched_results = relevant_results
|
| 298 |
+
logger.info(f"Filtered to {len(enriched_results)} cooking-relevant results")
|
| 299 |
+
|
| 300 |
+
# Process with cooking focus
|
| 301 |
+
summary, url_mapping = self.cooking_processor.process_results(enriched_results, query)
|
| 302 |
+
|
| 303 |
+
logger.info(f"Cooking focus search completed: {len(url_mapping)} sources")
|
| 304 |
+
return summary, url_mapping
|
| 305 |
+
|
| 306 |
+
def multilingual_cooking_search(self, query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
|
| 307 |
+
"""Comprehensive multilingual cooking search"""
|
| 308 |
+
logger.info(f"Multilingual cooking search for: {query} (target: {target_language})")
|
| 309 |
+
|
| 310 |
+
# Detect source language
|
| 311 |
+
source_language = self.language_processor.detect_language(query)
|
| 312 |
+
logger.info(f"Detected source language: {source_language}")
|
| 313 |
+
|
| 314 |
+
# Use multilingual search with language preference
|
| 315 |
+
summary, url_mapping = self.search(query, num_results, target_language)
|
| 316 |
+
|
| 317 |
+
logger.info(f"Multilingual cooking search completed: {len(url_mapping)} sources")
|
| 318 |
+
return summary, url_mapping
|
| 319 |
+
|
| 320 |
+
def comprehensive_search(self, query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
|
| 321 |
+
"""Comprehensive search with maximum information extraction and detailed references"""
|
| 322 |
+
logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")
|
| 323 |
+
|
| 324 |
+
# Detect source language
|
| 325 |
+
source_language = self.language_processor.detect_language(query)
|
| 326 |
+
logger.info(f"Detected source language: {source_language}")
|
| 327 |
+
|
| 328 |
+
# Execute comprehensive search
|
| 329 |
+
search_results = []
|
| 330 |
+
video_results = []
|
| 331 |
+
|
| 332 |
+
# 1. Multilingual text search
|
| 333 |
+
text_summary, text_url_mapping = self.search(query, num_results, target_language)
|
| 334 |
+
|
| 335 |
+
# 2. Video search if requested
|
| 336 |
+
if include_videos:
|
| 337 |
+
try:
|
| 338 |
+
video_results = self.video_search(query, num_results=5, target_language=target_language)
|
| 339 |
+
logger.info(f"Video search found {len(video_results)} videos")
|
| 340 |
+
except Exception as e:
|
| 341 |
+
logger.warning(f"Video search failed: {e}")
|
| 342 |
+
|
| 343 |
+
# 3. Aggregate all sources
|
| 344 |
+
all_sources = []
|
| 345 |
+
|
| 346 |
+
# Add text sources
|
| 347 |
+
for i, url in text_url_mapping.items():
|
| 348 |
+
# Find corresponding source data
|
| 349 |
+
source_data = self._find_source_data(url, text_url_mapping)
|
| 350 |
+
if source_data:
|
| 351 |
+
all_sources.append(source_data)
|
| 352 |
+
|
| 353 |
+
# Add video sources
|
| 354 |
+
for video in video_results:
|
| 355 |
+
all_sources.append(video)
|
| 356 |
+
|
| 357 |
+
# 4. Process with enhanced content processor
|
| 358 |
+
if all_sources:
|
| 359 |
+
comprehensive_summary, detailed_mapping = self.enhanced_processor.process_comprehensive_content(all_sources, query)
|
| 360 |
+
else:
|
| 361 |
+
comprehensive_summary = text_summary
|
| 362 |
+
detailed_mapping = text_url_mapping
|
| 363 |
+
|
| 364 |
+
# 5. Create comprehensive source aggregation
|
| 365 |
+
source_aggregation = self.source_aggregator.aggregate_sources(all_sources, video_results)
|
| 366 |
+
|
| 367 |
+
# 6. Generate comprehensive references
|
| 368 |
+
comprehensive_references = self.source_aggregator.create_comprehensive_references(all_sources, max_references=20)
|
| 369 |
+
|
| 370 |
+
# 7. Add inline citations
|
| 371 |
+
final_summary = self.enhanced_processor.create_inline_citations(comprehensive_summary, detailed_mapping)
|
| 372 |
+
|
| 373 |
+
# 8. Add source statistics
|
| 374 |
+
source_stats = self.enhanced_processor.generate_source_statistics(all_sources)
|
| 375 |
+
|
| 376 |
+
# 9. Combine everything
|
| 377 |
+
final_response = f"{final_summary}\n\n{comprehensive_references}\n\n{source_stats}"
|
| 378 |
+
|
| 379 |
+
logger.info(f"Comprehensive search completed: {len(all_sources)} total sources processed")
|
| 380 |
+
|
| 381 |
+
return final_response, detailed_mapping, source_aggregation
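For orientation, a hedged usage sketch of the comprehensive flow above; SearchCoordinator is an assumed class name and a no-argument constructor is assumed, since neither is visible in this hunk:

# Hypothetical usage of the comprehensive search pipeline above.
coordinator = SearchCoordinator()  # assumed name and constructor
summary, url_mapping, aggregation = coordinator.comprehensive_search(
    "how to make beef pho",
    num_results=15,
    target_language="vi",
    include_videos=True,
)
print(summary)                  # cited summary + references + source statistics
for idx, url in sorted(url_mapping.items()):
    print(f"[{idx}] {url}")     # numbered citations back to their sources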
|
| 382 |
+
|
| 383 |
+
def _find_source_data(self, url: str, url_mapping: Dict[int, str]) -> Dict:
|
| 384 |
+
"""Find source data for a given URL"""
|
| 385 |
+
# This is a simplified version - ensure required fields always exist
|
| 386 |
+
return {
|
| 387 |
+
'url': url,
|
| 388 |
+
'title': f"Source: {url}",
|
| 389 |
+
'content': '',
|
| 390 |
+
'domain': self._extract_domain(url),
|
| 391 |
+
'type': 'text',
|
| 392 |
+
'source_type': 'text',
|
| 393 |
+
'language': 'en',
|
| 394 |
+
'source_name': '',
|
| 395 |
+
'platform': ''
|
| 396 |
+
}
|
| 397 |
+
|
| 398 |
+
def _extract_domain(self, url: str) -> str:
|
| 399 |
+
"""Extract domain from URL"""
|
| 400 |
+
try:
|
| 401 |
+
from urllib.parse import urlparse
|
| 402 |
+
parsed = urlparse(url)
|
| 403 |
+
domain = parsed.netloc.lower()
|
| 404 |
+
if domain.startswith('www.'):
|
| 405 |
+
domain = domain[4:]
|
| 406 |
+
return domain
|
| 407 |
+
except Exception:
|
| 408 |
+
return ''
|
| 409 |
+
|
| 410 |
+
def video_search(self, query: str, num_results: int = 3, target_language: str = None) -> List[Dict]:
|
| 411 |
+
"""Search for cooking videos across multiple platforms"""
|
| 412 |
+
logger.info(f"Video search for: {query} (target: {target_language})")
|
| 413 |
+
|
| 414 |
+
# Detect language if not provided
|
| 415 |
+
if not target_language:
|
| 416 |
+
target_language = self.language_processor.detect_language(query)
|
| 417 |
+
|
| 418 |
+
# Map language codes
|
| 419 |
+
lang_mapping = {
|
| 420 |
+
'EN': 'en',
|
| 421 |
+
'VI': 'vi',
|
| 422 |
+
'ZH': 'zh',
|
| 423 |
+
'en': 'en',
|
| 424 |
+
'vi': 'vi',
|
| 425 |
+
'zh': 'zh'
|
| 426 |
+
}
|
| 427 |
+
search_language = lang_mapping.get(target_language, 'en')
|
| 428 |
+
|
| 429 |
+
# Search for videos
|
| 430 |
+
raw_results = self.video_engine.search(query, num_results, search_language)
|
| 431 |
+
|
| 432 |
+
# Simple video relevance filtering
|
| 433 |
+
cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
|
| 434 |
+
filtered_video_results = []
|
| 435 |
+
for result in raw_results:
|
| 436 |
+
title = result.get('title', '').lower()
|
| 437 |
+
if any(keyword in title for keyword in cooking_keywords):
|
| 438 |
+
filtered_video_results.append(result)
|
| 439 |
+
|
| 440 |
+
# Validate and normalize results to avoid corrupted cards/links
|
| 441 |
+
video_results = self._sanitize_video_results(filtered_video_results, limit=num_results)
|
| 442 |
+
|
| 443 |
+
logger.info(f"Video search completed: {len(video_results)} videos found")
|
| 444 |
+
return video_results
|
| 445 |
+
|
| 446 |
+
def _sanitize_video_results(self, results: List[Dict], limit: int = 4) -> List[Dict]:
|
| 447 |
+
"""Ensure each video has a valid absolute https URL, reasonable title, and platform metadata.
|
| 448 |
+
Drop unreachable/broken items and deduplicate by URL.
|
| 449 |
+
"""
|
| 450 |
+
from urllib.parse import urlparse
|
| 451 |
+
import requests
|
| 452 |
+
clean: List[Dict] = []
|
| 453 |
+
seen = set()
|
| 454 |
+
for item in results or []:
|
| 455 |
+
url = (item or {}).get('url', '')
|
| 456 |
+
title = (item or {}).get('title', '').strip()
|
| 457 |
+
if not url or not title:
|
| 458 |
+
continue
|
| 459 |
+
try:
|
| 460 |
+
parsed = urlparse(url)
|
| 461 |
+
if parsed.scheme not in ('http', 'https'):
|
| 462 |
+
continue
|
| 463 |
+
if not parsed.netloc:
|
| 464 |
+
continue
|
| 465 |
+
# Quick reachability check; YouTube often blocks HEAD, so skip strict checks for youtube domain
|
| 466 |
+
host = parsed.netloc.lower()
|
| 467 |
+
norm_url = url
|
| 468 |
+
if 'youtube.com' not in host:
|
| 469 |
+
try:
|
| 470 |
+
r = requests.head(url, allow_redirects=True, timeout=3)
|
| 471 |
+
if r.status_code >= 400:
|
| 472 |
+
continue
|
| 473 |
+
norm_url = getattr(r, 'url', url) or url
|
| 474 |
+
except Exception:
|
| 475 |
+
# If HEAD blocked, try a light GET with small timeout
|
| 476 |
+
try:
|
| 477 |
+
r = requests.get(url, stream=True, timeout=4)
|
| 478 |
+
if r.status_code >= 400:
|
| 479 |
+
continue
|
| 480 |
+
norm_url = getattr(r, 'url', url) or url
|
| 481 |
+
except Exception:
|
| 482 |
+
continue
|
| 483 |
+
if norm_url in seen:
|
| 484 |
+
continue
|
| 485 |
+
seen.add(norm_url)
|
| 486 |
+
platform = parsed.netloc.lower()
|
| 487 |
+
if platform.startswith('www.'):
|
| 488 |
+
platform = platform[4:]
|
| 489 |
+
clean.append({
|
| 490 |
+
'title': title,
|
| 491 |
+
'url': norm_url,
|
| 492 |
+
'thumbnail': item.get('thumbnail', ''),
|
| 493 |
+
'source': item.get('source', platform.split('.')[0]),
|
| 494 |
+
'platform': platform,
|
| 495 |
+
'language': item.get('language', 'en')
|
| 496 |
+
})
|
| 497 |
+
if len(clean) >= limit:
|
| 498 |
+
break
|
| 499 |
+
except Exception:
|
| 500 |
+
continue
|
| 501 |
+
return clean
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
|
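The sanitizer above mixes URL validation, reachability probing, and deduplication. A standalone sketch of just the reachability probe (HEAD first, streamed GET as a fallback) makes that behaviour easier to test in isolation; this helper is illustrative and not part of the commit:

from typing import Optional
import requests

def probe_url(url: str, timeout: float = 3.0) -> Optional[str]:
    """Return the redirect-resolved URL if it answers with a non-error status, else None."""
    # HEAD is cheap, but some hosts block it, so fall back to a streamed GET.
    for method, kwargs in (
        (requests.head, {'allow_redirects': True, 'timeout': timeout}),
        (requests.get, {'stream': True, 'timeout': timeout + 1}),
    ):
        try:
            r = method(url, **kwargs)
            if r.status_code < 400:
                return r.url or url
        except requests.RequestException:
            continue
    return None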
search/engines/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
| 1 |
+
from .duckduckgo import DuckDuckGoEngine
|
| 2 |
+
from .cooking import CookingSearchEngine
|
| 3 |
+
from .multilingual import MultilingualCookingEngine
|
| 4 |
+
from .video import VideoSearchEngine
|
| 5 |
+
|
| 6 |
+
__all__ = ['DuckDuckGoEngine', 'CookingSearchEngine', 'MultilingualCookingEngine', 'VideoSearchEngine']
|
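Assuming the package exports match the classes actually defined in this commit (CookingSearchEngine, DuckDuckGoEngine, MultilingualCookingEngine, VideoSearchEngine), typical consumption looks like the sketch below; note that DuckDuckGoEngine and VideoSearchEngine also expect models.reranker to be importable, which is not among the files shown here:

from search.engines import CookingSearchEngine, DuckDuckGoEngine

cooking = CookingSearchEngine(timeout=10)
general = DuckDuckGoEngine(timeout=10)

results = cooking.search("sourdough starter", num_results=5)
results += general.search("sourdough starter troubleshooting", num_results=3)
for r in results:
    print(f"{r['source']}: {r['title']} -> {r['url']}")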
search/engines/cooking.py
ADDED
|
@@ -0,0 +1,197 @@
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import time
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
class CookingSearchEngine:
|
| 10 |
+
"""Specialized cooking search engine with curated sources"""
|
| 11 |
+
|
| 12 |
+
def __init__(self, timeout: int = 15):
|
| 13 |
+
self.session = requests.Session()
|
| 14 |
+
self.session.headers.update({
|
| 15 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
| 16 |
+
})
|
| 17 |
+
self.timeout = timeout
|
| 18 |
+
|
| 19 |
+
# Curated cooking sources
|
| 20 |
+
self.cooking_sources = {
|
| 21 |
+
'allrecipes': {
|
| 22 |
+
'base_url': 'https://www.allrecipes.com',
|
| 23 |
+
'search_url': 'https://www.allrecipes.com/search',
|
| 24 |
+
'domains': ['allrecipes.com']
|
| 25 |
+
},
|
| 26 |
+
'food_network': {
|
| 27 |
+
'base_url': 'https://www.foodnetwork.com',
|
| 28 |
+
'search_url': 'https://www.foodnetwork.com/search',
|
| 29 |
+
'domains': ['foodnetwork.com']
|
| 30 |
+
},
|
| 31 |
+
'epicurious': {
|
| 32 |
+
'base_url': 'https://www.epicurious.com',
|
| 33 |
+
'search_url': 'https://www.epicurious.com/search',
|
| 34 |
+
'domains': ['epicurious.com']
|
| 35 |
+
},
|
| 36 |
+
'serious_eats': {
|
| 37 |
+
'base_url': 'https://www.seriouseats.com',
|
| 38 |
+
'search_url': 'https://www.seriouseats.com/search',
|
| 39 |
+
'domains': ['seriouseats.com']
|
| 40 |
+
},
|
| 41 |
+
'bon_appetit': {
|
| 42 |
+
'base_url': 'https://www.bonappetit.com',
|
| 43 |
+
'search_url': 'https://www.bonappetit.com/search',
|
| 44 |
+
'domains': ['bonappetit.com']
|
| 45 |
+
}
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
def search(self, query: str, num_results: int = 10) -> List[Dict]:
|
| 49 |
+
"""Search cooking sources for relevant information"""
|
| 50 |
+
results = []
|
| 51 |
+
|
| 52 |
+
# Strategy 1: Direct cooking source searches
|
| 53 |
+
for source_name, source_config in self.cooking_sources.items():
|
| 54 |
+
if len(results) >= num_results:
|
| 55 |
+
break
|
| 56 |
+
|
| 57 |
+
source_results = self._search_cooking_source(query, source_name, source_config)
|
| 58 |
+
results.extend(source_results)
|
| 59 |
+
|
| 60 |
+
# Add delay between requests
|
| 61 |
+
time.sleep(0.5)
|
| 62 |
+
|
| 63 |
+
# Strategy 2: Cooking fallback sources
|
| 64 |
+
if len(results) < num_results:
|
| 65 |
+
fallback_results = self._get_fallback_sources(query, num_results - len(results))
|
| 66 |
+
results.extend(fallback_results)
|
| 67 |
+
|
| 68 |
+
return results[:num_results]
|
| 69 |
+
|
| 70 |
+
def _search_cooking_source(self, query: str, source_name: str, source_config: Dict) -> List[Dict]:
|
| 71 |
+
"""Search a specific cooking source"""
|
| 72 |
+
try:
|
| 73 |
+
search_url = source_config.get('search_url')
|
| 74 |
+
if not search_url:
|
| 75 |
+
return []
|
| 76 |
+
|
| 77 |
+
params = {
|
| 78 |
+
'q': query,
|
| 79 |
+
'query': query,
|
| 80 |
+
'search': query
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
response = self.session.get(search_url, params=params, timeout=self.timeout)
|
| 84 |
+
response.raise_for_status()
|
| 85 |
+
|
| 86 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 87 |
+
results = []
|
| 88 |
+
|
| 89 |
+
# Source-specific selectors
|
| 90 |
+
selectors = self._get_source_selectors(source_name)
|
| 91 |
+
|
| 92 |
+
for selector in selectors:
|
| 93 |
+
links = soup.select(selector)
|
| 94 |
+
if links:
|
| 95 |
+
logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
|
| 96 |
+
break
|
| 97 |
+
|
| 98 |
+
for link in links[:3]: # Limit per source
|
| 99 |
+
try:
|
| 100 |
+
href = link.get('href')
|
| 101 |
+
if not href:
|
| 102 |
+
continue
|
| 103 |
+
|
| 104 |
+
# Make absolute URL
|
| 105 |
+
if href.startswith('/'):
|
| 106 |
+
href = source_config['base_url'] + href
|
| 107 |
+
|
| 108 |
+
title = link.get_text(strip=True)
|
| 109 |
+
if title and href.startswith('http'):
|
| 110 |
+
results.append({
|
| 111 |
+
'url': href,
|
| 112 |
+
'title': title,
|
| 113 |
+
'source': source_name,
|
| 114 |
+
'domain': source_config['domains'][0]
|
| 115 |
+
})
|
| 116 |
+
except Exception as e:
|
| 117 |
+
logger.debug(f"Error parsing {source_name} link: {e}")
|
| 118 |
+
continue
|
| 119 |
+
|
| 120 |
+
return results
|
| 121 |
+
|
| 122 |
+
except Exception as e:
|
| 123 |
+
logger.warning(f"Cooking source {source_name} search failed: {e}")
|
| 124 |
+
return []
|
| 125 |
+
|
| 126 |
+
def _get_source_selectors(self, source_name: str) -> List[str]:
|
| 127 |
+
"""Get CSS selectors for specific cooking sources"""
|
| 128 |
+
selectors_map = {
|
| 129 |
+
'allrecipes': [
|
| 130 |
+
'a[href*="/recipe/"]',
|
| 131 |
+
'a[href*="/recipes/"]',
|
| 132 |
+
'.search-result a',
|
| 133 |
+
'.result-title a'
|
| 134 |
+
],
|
| 135 |
+
'food_network': [
|
| 136 |
+
'a[href*="/recipes/"]',
|
| 137 |
+
'.search-result a',
|
| 138 |
+
'.result-title a',
|
| 139 |
+
'a[href*="/recipe/"]'
|
| 140 |
+
],
|
| 141 |
+
'epicurious': [
|
| 142 |
+
'a[href*="/recipes/"]',
|
| 143 |
+
'.search-result a',
|
| 144 |
+
'.result-title a',
|
| 145 |
+
'a[href*="/recipe/"]'
|
| 146 |
+
],
|
| 147 |
+
'serious_eats': [
|
| 148 |
+
'a[href*="/recipes/"]',
|
| 149 |
+
'.search-result a',
|
| 150 |
+
'.result-title a',
|
| 151 |
+
'a[href*="/recipe/"]'
|
| 152 |
+
],
|
| 153 |
+
'bon_appetit': [
|
| 154 |
+
'a[href*="/recipes/"]',
|
| 155 |
+
'.search-result a',
|
| 156 |
+
'.result-title a',
|
| 157 |
+
'a[href*="/recipe/"]'
|
| 158 |
+
]
|
| 159 |
+
}
|
| 160 |
+
return selectors_map.get(source_name, ['a[href*="http"]'])
|
| 161 |
+
|
| 162 |
+
def _get_fallback_sources(self, query: str, num_results: int) -> List[Dict]:
|
| 163 |
+
"""Get fallback cooking sources when direct search fails"""
|
| 164 |
+
fallback_sources = [
|
| 165 |
+
{
|
| 166 |
+
'url': 'https://www.allrecipes.com/recipes',
|
| 167 |
+
'title': f'AllRecipes: {query}',
|
| 168 |
+
'source': 'allrecipes_fallback',
|
| 169 |
+
'domain': 'allrecipes.com'
|
| 170 |
+
},
|
| 171 |
+
{
|
| 172 |
+
'url': 'https://www.foodnetwork.com/recipes',
|
| 173 |
+
'title': f'Food Network: {query}',
|
| 174 |
+
'source': 'foodnetwork_fallback',
|
| 175 |
+
'domain': 'foodnetwork.com'
|
| 176 |
+
},
|
| 177 |
+
{
|
| 178 |
+
'url': 'https://www.epicurious.com/recipes-menus',
|
| 179 |
+
'title': f'Epicurious: {query}',
|
| 180 |
+
'source': 'epicurious_fallback',
|
| 181 |
+
'domain': 'epicurious.com'
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
'url': 'https://www.seriouseats.com/recipes',
|
| 185 |
+
'title': f'Serious Eats: {query}',
|
| 186 |
+
'source': 'seriouseats_fallback',
|
| 187 |
+
'domain': 'seriouseats.com'
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
'url': 'https://www.bonappetit.com/recipes',
|
| 191 |
+
'title': f'Bon Appétit: {query}',
|
| 192 |
+
'source': 'bonappetit_fallback',
|
| 193 |
+
'domain': 'bonappetit.com'
|
| 194 |
+
}
|
| 195 |
+
]
|
| 196 |
+
|
| 197 |
+
return fallback_sources[:num_results]
|
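One implementation note on the source search above: building absolute links with base_url + href only covers hrefs that start with '/'. urllib.parse.urljoin also handles relative and protocol-relative links; a hedged alternative helper (not part of the commit):

from urllib.parse import urljoin

def absolutize(base_url: str, href: str) -> str:
    """Resolve a possibly-relative link against the source's base URL."""
    # Handles '/recipe/x', 'recipe/x', and '//cdn.example.com/x' alike.
    return urljoin(base_url.rstrip('/') + '/', href)

# absolutize('https://www.allrecipes.com', '/recipe/123') -> 'https://www.allrecipes.com/recipe/123'
# absolutize('https://www.allrecipes.com', 'recipe/123')  -> 'https://www.allrecipes.com/recipe/123'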
search/engines/duckduckgo.py
ADDED
|
@@ -0,0 +1,599 @@
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import time
|
| 6 |
+
from models.reranker import MedicalReranker
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
class DuckDuckGoEngine:
|
| 11 |
+
"""DuckDuckGo search engine with multiple strategies"""
|
| 12 |
+
|
| 13 |
+
def __init__(self, timeout: int = 15):
|
| 14 |
+
self.session = requests.Session()
|
| 15 |
+
self.session.headers.update({
|
| 16 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 17 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 18 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 19 |
+
'Accept-Encoding': 'gzip, deflate',
|
| 20 |
+
'Connection': 'keep-alive',
|
| 21 |
+
'Upgrade-Insecure-Requests': '1',
|
| 22 |
+
})
|
| 23 |
+
self.timeout = timeout
|
| 24 |
+
self.reranker = MedicalReranker()
|
| 25 |
+
|
| 26 |
+
def search(self, query: str, num_results: int = 10) -> List[Dict]:
|
| 27 |
+
"""Search with multiple DuckDuckGo strategies and medical focus"""
|
| 28 |
+
# Clean and simplify the query first
|
| 29 |
+
clean_query = self._clean_query(query)
|
| 30 |
+
logger.info(f"Cleaned query: '{query}' -> '{clean_query}'")
|
| 31 |
+
|
| 32 |
+
results = []
|
| 33 |
+
min_score = 0.15 # Reduced from 0.3 to be less strict
|
| 34 |
+
|
| 35 |
+
# Strategy 1: HTML Interface with medical focus
|
| 36 |
+
html_results = self._search_html(clean_query, num_results * 3) # Get more to filter
|
| 37 |
+
if html_results:
|
| 38 |
+
results.extend(html_results)
|
| 39 |
+
logger.info(f"DuckDuckGo HTML found {len(html_results)} results")
|
| 40 |
+
|
| 41 |
+
# Strategy 2: Instant Answer API
|
| 42 |
+
if len(results) < num_results * 2:
|
| 43 |
+
api_results = self._search_api(clean_query, num_results)
|
| 44 |
+
if api_results:
|
| 45 |
+
results.extend(api_results)
|
| 46 |
+
logger.info(f"DuckDuckGo API found {len(api_results)} results")
|
| 47 |
+
|
| 48 |
+
# Strategy 3: Lite Interface (mobile-friendly)
|
| 49 |
+
if len(results) < num_results * 2:
|
| 50 |
+
lite_results = self._search_lite(clean_query, num_results)
|
| 51 |
+
if lite_results:
|
| 52 |
+
results.extend(lite_results)
|
| 53 |
+
logger.info(f"DuckDuckGo Lite found {len(lite_results)} results")
|
| 54 |
+
|
| 55 |
+
# If still no results, try with even simpler query
|
| 56 |
+
if not results:
|
| 57 |
+
simple_query = self._simplify_query(clean_query)
|
| 58 |
+
if simple_query != clean_query:
|
| 59 |
+
logger.info(f"Trying simplified query: '{simple_query}'")
|
| 60 |
+
html_results = self._search_html(simple_query, num_results * 2)
|
| 61 |
+
if html_results:
|
| 62 |
+
results.extend(html_results)
|
| 63 |
+
logger.info(f"Simplified query found {len(html_results)} results")
|
| 64 |
+
|
| 65 |
+
# If still no results, try fallback search engines
|
| 66 |
+
if not results:
|
| 67 |
+
logger.warning("DuckDuckGo failed, trying fallback search engines")
|
| 68 |
+
fallback_results = self._fallback_search(clean_query, num_results)
|
| 69 |
+
if fallback_results:
|
| 70 |
+
results.extend(fallback_results)
|
| 71 |
+
logger.info(f"Fallback search found {len(fallback_results)} results")
|
| 72 |
+
|
| 73 |
+
# Filter out irrelevant results first (less aggressive)
|
| 74 |
+
filtered_results = self._filter_irrelevant_sources(results)
|
| 75 |
+
logger.info(f"Filtered {len(results)} results to {len(filtered_results)} relevant results")
|
| 76 |
+
|
| 77 |
+
# If we have results, use reranker; otherwise return what we have
|
| 78 |
+
if filtered_results:
|
| 79 |
+
try:
|
| 80 |
+
reranked_results = self.reranker.rerank_results(clean_query, filtered_results, min_score)
|
| 81 |
+
logger.info(f"Reranked {len(filtered_results)} results to {len(reranked_results)} high-quality results")
|
| 82 |
+
|
| 83 |
+
# If reranking filtered out too many results, be more lenient
|
| 84 |
+
if len(reranked_results) < min(3, num_results) and len(filtered_results) > 0:
|
| 85 |
+
logger.warning(f"Reranking too strict ({len(reranked_results)} results), using fallback with lower threshold")
|
| 86 |
+
# Try with even lower threshold
|
| 87 |
+
fallback_results = self.reranker.rerank_results(clean_query, filtered_results, 0.05)
|
| 88 |
+
if len(fallback_results) > len(reranked_results):
|
| 89 |
+
return fallback_results[:num_results]
|
| 90 |
+
else:
|
| 91 |
+
# Last resort: return original filtered results with basic scoring
|
| 92 |
+
for i, result in enumerate(filtered_results[:num_results]):
|
| 93 |
+
result['composite_score'] = 0.5 - (i * 0.05) # Decreasing score
|
| 94 |
+
return filtered_results[:num_results]
|
| 95 |
+
|
| 96 |
+
return reranked_results[:num_results]
|
| 97 |
+
except Exception as e:
|
| 98 |
+
logger.warning(f"Reranking failed: {e}, returning filtered results")
|
| 99 |
+
return filtered_results[:num_results]
|
| 100 |
+
|
| 101 |
+
return filtered_results[:num_results]
|
| 102 |
+
|
| 103 |
+
def _clean_query(self, query: str) -> str:
|
| 104 |
+
"""Clean and normalize search query"""
|
| 105 |
+
if not query:
|
| 106 |
+
return ""
|
| 107 |
+
|
| 108 |
+
# Remove bullet points and special characters
|
| 109 |
+
import re
|
| 110 |
+
cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query) # Remove bullet points
|
| 111 |
+
cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned) # Keep only alphanumeric, spaces, hyphens, dots
|
| 112 |
+
cleaned = re.sub(r'\s+', ' ', cleaned) # Normalize whitespace
|
| 113 |
+
cleaned = cleaned.strip()
|
| 114 |
+
|
| 115 |
+
# Remove common prefixes that might confuse search
|
| 116 |
+
prefixes_to_remove = [
|
| 117 |
+
r'^(en|vi|zh)\s*:\s*',
|
| 118 |
+
r'^(search|find|look for)\s+',
|
| 119 |
+
r'^(how to|what is|what are)\s+',
|
| 120 |
+
]
|
| 121 |
+
|
| 122 |
+
for prefix in prefixes_to_remove:
|
| 123 |
+
cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)
|
| 124 |
+
|
| 125 |
+
return cleaned.strip()
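For reference, a standalone copy of the cleaning pass above, runnable on its own. One behaviour worth noting: the punctuation regex removes ':' before the prefix patterns run, so a leading 'vi:' survives as the bare token 'vi' rather than being stripped by the language-prefix rule.

import re

def clean_query(query: str) -> str:
    """Strip bullets and punctuation, then drop common leading search prefixes."""
    cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query or '')
    cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    for prefix in (r'^(en|vi|zh)\s*:\s*',
                   r'^(search|find|look for)\s+',
                   r'^(how to|what is|what are)\s+'):
        cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)
    return cleaned.strip()

assert clean_query('search how to treat chronic migraine?') == 'treat chronic migraine'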
|
| 126 |
+
|
| 127 |
+
def _simplify_query(self, query: str) -> str:
|
| 128 |
+
"""Simplify query to core medical terms"""
|
| 129 |
+
if not query:
|
| 130 |
+
return ""
|
| 131 |
+
|
| 132 |
+
# Extract key medical terms
|
| 133 |
+
import re
|
| 134 |
+
words = query.split()
|
| 135 |
+
|
| 136 |
+
# Keep medical keywords and important terms
|
| 137 |
+
medical_keywords = [
|
| 138 |
+
'migraine', 'headache', 'pain', 'treatment', 'therapy', 'medication', 'drug',
|
| 139 |
+
'chronic', 'acute', 'symptoms', 'diagnosis', 'prevention', 'management',
|
| 140 |
+
'disease', 'condition', 'syndrome', 'disorder', 'infection', 'inflammation',
|
| 141 |
+
'blood', 'heart', 'lung', 'brain', 'liver', 'kidney', 'diabetes', 'cancer',
|
| 142 |
+
'covid', 'flu', 'cold', 'fever', 'cough', 'breathing', 'chest', 'stomach'
|
| 143 |
+
]
|
| 144 |
+
|
| 145 |
+
# Keep words that are medical keywords or are important (longer than 3 chars)
|
| 146 |
+
important_words = []
|
| 147 |
+
for word in words:
|
| 148 |
+
word_lower = word.lower()
|
| 149 |
+
if word_lower in medical_keywords or len(word) > 3:
|
| 150 |
+
important_words.append(word)
|
| 151 |
+
|
| 152 |
+
# If we have important words, use them; otherwise use first few words
|
| 153 |
+
if important_words:
|
| 154 |
+
return ' '.join(important_words[:5]) # Max 5 words
|
| 155 |
+
else:
|
| 156 |
+
return ' '.join(words[:3]) # Max 3 words
|
| 157 |
+
|
| 158 |
+
def _filter_irrelevant_sources(self, results: List[Dict]) -> List[Dict]:
|
| 159 |
+
"""Filter out irrelevant sources like generic health pages, quizzes, etc."""
|
| 160 |
+
import re
|
| 161 |
+
filtered = []
|
| 162 |
+
|
| 163 |
+
# Only exclude obvious non-medical content
|
| 164 |
+
exclude_patterns = [
|
| 165 |
+
r'/quiz$', # Quiz pages (end of URL)
|
| 166 |
+
r'/test$', # Test pages (end of URL)
|
| 167 |
+
r'/assessment', # Assessment pages
|
| 168 |
+
r'/survey', # Survey pages
|
| 169 |
+
r'homepage|main page|index', # Homepage/index pages
|
| 170 |
+
r'login|sign.up|register', # Auth pages
|
| 171 |
+
r'contact|about.us|privacy', # Info pages
|
| 172 |
+
r'subscribe|newsletter|rss', # Subscription pages
|
| 173 |
+
r'sitemap', # Navigation pages
|
| 174 |
+
]
|
| 175 |
+
|
| 176 |
+
for result in results:
|
| 177 |
+
url = result.get('url', '').lower()
|
| 178 |
+
title = result.get('title', '').lower()
|
| 179 |
+
|
| 180 |
+
# Skip if matches exclude patterns
|
| 181 |
+
should_exclude = False
|
| 182 |
+
for pattern in exclude_patterns:
|
| 183 |
+
if re.search(pattern, url) or re.search(pattern, title):
|
| 184 |
+
should_exclude = True
|
| 185 |
+
logger.debug(f"Excluding irrelevant source: {url}")
|
| 186 |
+
break
|
| 187 |
+
|
| 188 |
+
if not should_exclude:
|
| 189 |
+
filtered.append(result)
|
| 190 |
+
|
| 191 |
+
# If we filtered out too many, be less aggressive
|
| 192 |
+
if len(filtered) < len(results) * 0.3: # If we kept less than 30%
|
| 193 |
+
logger.warning(f"Filtering too aggressive, keeping more results: {len(results)} -> {len(filtered)}")
|
| 194 |
+
# Return original results with minimal filtering
|
| 195 |
+
minimal_filtered = []
|
| 196 |
+
for result in results:
|
| 197 |
+
url = result.get('url', '').lower()
|
| 198 |
+
if not any(re.search(pattern, url) for pattern in [r'login', r'sign.up', r'register']):
|
| 199 |
+
minimal_filtered.append(result)
|
| 200 |
+
return minimal_filtered
|
| 201 |
+
|
| 202 |
+
return filtered
|
| 203 |
+
|
| 204 |
+
def _search_html(self, query: str, num_results: int) -> List[Dict]:
|
| 205 |
+
"""Search using DuckDuckGo HTML interface with better error handling"""
|
| 206 |
+
try:
|
| 207 |
+
# Try multiple DuckDuckGo endpoints
|
| 208 |
+
endpoints = [
|
| 209 |
+
{
|
| 210 |
+
'url': 'https://html.duckduckgo.com/html/',
|
| 211 |
+
'params': {
|
| 212 |
+
'q': query,
|
| 213 |
+
'kl': 'us-en',
|
| 214 |
+
's': '0',
|
| 215 |
+
'dc': '1',
|
| 216 |
+
'v': 'l'
|
| 217 |
+
}
|
| 218 |
+
},
|
| 219 |
+
{
|
| 220 |
+
'url': 'https://lite.duckduckgo.com/lite/',
|
| 221 |
+
'params': {
|
| 222 |
+
'q': query,
|
| 223 |
+
'kl': 'us-en'
|
| 224 |
+
}
|
| 225 |
+
},
|
| 226 |
+
{
|
| 227 |
+
'url': 'https://duckduckgo.com/html/',
|
| 228 |
+
'params': {
|
| 229 |
+
'q': query,
|
| 230 |
+
'kl': 'us-en'
|
| 231 |
+
}
|
| 232 |
+
}
|
| 233 |
+
]
|
| 234 |
+
|
| 235 |
+
for endpoint in endpoints:
|
| 236 |
+
try:
|
| 237 |
+
# Add random delay to avoid rate limiting
|
| 238 |
+
import time
|
| 239 |
+
time.sleep(0.5)
|
| 240 |
+
|
| 241 |
+
# Update headers to look more like a real browser
|
| 242 |
+
headers = self.session.headers.copy()
|
| 243 |
+
headers.update({
|
| 244 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 245 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 246 |
+
'Accept-Encoding': 'gzip, deflate',
|
| 247 |
+
'DNT': '1',
|
| 248 |
+
'Connection': 'keep-alive',
|
| 249 |
+
'Upgrade-Insecure-Requests': '1',
|
| 250 |
+
})
|
| 251 |
+
|
| 252 |
+
response = self.session.get(
|
| 253 |
+
endpoint['url'],
|
| 254 |
+
params=endpoint['params'],
|
| 255 |
+
headers=headers,
|
| 256 |
+
timeout=self.timeout
|
| 257 |
+
)
|
| 258 |
+
|
| 259 |
+
if response.status_code == 403:
|
| 260 |
+
logger.warning(f"DuckDuckGo endpoint {endpoint['url']} returned 403, trying next...")
|
| 261 |
+
continue
|
| 262 |
+
elif response.status_code == 429:
|
| 263 |
+
logger.warning(f"DuckDuckGo rate limited, waiting...")
|
| 264 |
+
time.sleep(2)
|
| 265 |
+
continue
# On a successful (non-403/429) response, stop trying further endpoints here;
# without this break the for-else below runs every time and returns [], so the
# parsing code after the loop would never execute.
break
|
| 266 |
+
|
| 267 |
+
except Exception as e:
|
| 268 |
+
logger.warning(f"DuckDuckGo endpoint {endpoint['url']} failed: {e}")
|
| 269 |
+
if endpoint == endpoints[-1]: # Last endpoint
|
| 270 |
+
raise e
|
| 271 |
+
continue
|
| 272 |
+
else:
|
| 273 |
+
# All endpoints failed
|
| 274 |
+
logger.error("All DuckDuckGo endpoints failed")
|
| 275 |
+
return []
|
| 276 |
+
|
| 277 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 278 |
+
results = []
|
| 279 |
+
|
| 280 |
+
# Multiple selectors for different DDG layouts
|
| 281 |
+
selectors = [
|
| 282 |
+
'a.result__a',
|
| 283 |
+
'a[data-testid="result-title-a"]',
|
| 284 |
+
'.result__title a',
|
| 285 |
+
'.web-result a',
|
| 286 |
+
'.result a',
|
| 287 |
+
'a[href*="http"]:not([href*="duckduckgo.com"])'
|
| 288 |
+
]
|
| 289 |
+
|
| 290 |
+
for selector in selectors:
|
| 291 |
+
links = soup.select(selector)
|
| 292 |
+
if links:
|
| 293 |
+
logger.info(f"Using selector: {selector} - found {len(links)} links")
|
| 294 |
+
break
|
| 295 |
+
|
| 296 |
+
for link in links[:num_results]:
|
| 297 |
+
try:
|
| 298 |
+
href = link.get('href')
|
| 299 |
+
if not href or href.startswith('#') or 'duckduckgo.com' in href:
|
| 300 |
+
continue
|
| 301 |
+
|
| 302 |
+
# Clean up DDG redirect URLs
|
| 303 |
+
if href.startswith('/l/?uddg='):
|
| 304 |
+
import urllib.parse
|
| 305 |
+
href = urllib.parse.unquote(href.split('uddg=')[1])
|
| 306 |
+
|
| 307 |
+
title = link.get_text(strip=True)
|
| 308 |
+
if title and href.startswith('http'):
|
| 309 |
+
results.append({
|
| 310 |
+
'url': href,
|
| 311 |
+
'title': title,
|
| 312 |
+
'source': 'duckduckgo_html'
|
| 313 |
+
})
|
| 314 |
+
except Exception as e:
|
| 315 |
+
logger.debug(f"Error parsing link: {e}")
|
| 316 |
+
continue
|
| 317 |
+
|
| 318 |
+
return results
|
| 319 |
+
|
| 320 |
+
except Exception as e:
|
| 321 |
+
logger.warning(f"DuckDuckGo HTML search failed: {e}")
|
| 322 |
+
return []
|
| 323 |
+
|
| 324 |
+
def _search_api(self, query: str, num_results: int) -> List[Dict]:
|
| 325 |
+
"""Search using DuckDuckGo Instant Answer API"""
|
| 326 |
+
try:
|
| 327 |
+
url = "https://api.duckduckgo.com/"
|
| 328 |
+
params = {
|
| 329 |
+
'q': query,
|
| 330 |
+
'format': 'json',
|
| 331 |
+
'no_html': '1',
|
| 332 |
+
'skip_disambig': '1',
|
| 333 |
+
't': 'MedicalChatbot'
|
| 334 |
+
}
|
| 335 |
+
|
| 336 |
+
response = self.session.get(url, params=params, timeout=self.timeout)
|
| 337 |
+
response.raise_for_status()
|
| 338 |
+
data = response.json()
|
| 339 |
+
|
| 340 |
+
results = []
|
| 341 |
+
|
| 342 |
+
# Abstract result
|
| 343 |
+
if data.get('AbstractURL') and data.get('Abstract'):
|
| 344 |
+
results.append({
|
| 345 |
+
'url': data['AbstractURL'],
|
| 346 |
+
'title': data.get('Heading', query),
|
| 347 |
+
'content': data.get('Abstract', ''),
|
| 348 |
+
'source': 'duckduckgo_api'
|
| 349 |
+
})
|
| 350 |
+
|
| 351 |
+
# Related topics
|
| 352 |
+
for topic in data.get('RelatedTopics', []):
|
| 353 |
+
if len(results) >= num_results:
|
| 354 |
+
break
|
| 355 |
+
|
| 356 |
+
if isinstance(topic, dict) and topic.get('FirstURL'):
|
| 357 |
+
text = topic.get('Text', '')
|
| 358 |
+
title = text.split(' - ')[0] if ' - ' in text else text[:50]
|
| 359 |
+
|
| 360 |
+
results.append({
|
| 361 |
+
'url': topic['FirstURL'],
|
| 362 |
+
'title': title,
|
| 363 |
+
'content': text,
|
| 364 |
+
'source': 'duckduckgo_api'
|
| 365 |
+
})
|
| 366 |
+
|
| 367 |
+
return results
|
| 368 |
+
|
| 369 |
+
except Exception as e:
|
| 370 |
+
logger.warning(f"DuckDuckGo API search failed: {e}")
|
| 371 |
+
return []
|
| 372 |
+
|
| 373 |
+
def _search_lite(self, query: str, num_results: int) -> List[Dict]:
|
| 374 |
+
"""Search using DuckDuckGo Lite interface"""
|
| 375 |
+
try:
|
| 376 |
+
url = "https://lite.duckduckgo.com/lite/"
|
| 377 |
+
params = {
|
| 378 |
+
'q': query,
|
| 379 |
+
'kl': 'us-en'
|
| 380 |
+
}
|
| 381 |
+
|
| 382 |
+
response = self.session.get(url, params=params, timeout=self.timeout)
|
| 383 |
+
response.raise_for_status()
|
| 384 |
+
|
| 385 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 386 |
+
results = []
|
| 387 |
+
|
| 388 |
+
# Lite interface selectors
|
| 389 |
+
links = soup.select('a[href*="http"]:not([href*="duckduckgo.com"])')
|
| 390 |
+
|
| 391 |
+
for link in links[:num_results]:
|
| 392 |
+
try:
|
| 393 |
+
href = link.get('href')
|
| 394 |
+
title = link.get_text(strip=True)
|
| 395 |
+
|
| 396 |
+
if href and title and href.startswith('http'):
|
| 397 |
+
results.append({
|
| 398 |
+
'url': href,
|
| 399 |
+
'title': title,
|
| 400 |
+
'source': 'duckduckgo_lite'
|
| 401 |
+
})
|
| 402 |
+
except Exception as e:
|
| 403 |
+
logger.debug(f"Error parsing lite link: {e}")
|
| 404 |
+
continue
|
| 405 |
+
|
| 406 |
+
return results
|
| 407 |
+
|
| 408 |
+
except Exception as e:
|
| 409 |
+
logger.warning(f"DuckDuckGo Lite search failed: {e}")
|
| 410 |
+
return []
|
| 411 |
+
|
| 412 |
+
def _fallback_search(self, query: str, num_results: int) -> List[Dict]:
|
| 413 |
+
"""Fallback search using alternative methods when DuckDuckGo fails"""
|
| 414 |
+
results = []
|
| 415 |
+
|
| 416 |
+
# Try Bing search as fallback
|
| 417 |
+
try:
|
| 418 |
+
bing_results = self._search_bing(query, num_results)
|
| 419 |
+
if bing_results:
|
| 420 |
+
results.extend(bing_results)
|
| 421 |
+
logger.info(f"Bing fallback found {len(bing_results)} results")
|
| 422 |
+
except Exception as e:
|
| 423 |
+
logger.warning(f"Bing fallback failed: {e}")
|
| 424 |
+
|
| 425 |
+
# Try Startpage search as fallback
|
| 426 |
+
try:
|
| 427 |
+
startpage_results = self._search_startpage(query, num_results)
|
| 428 |
+
if startpage_results:
|
| 429 |
+
results.extend(startpage_results)
|
| 430 |
+
logger.info(f"Startpage fallback found {len(startpage_results)} results")
|
| 431 |
+
except Exception as e:
|
| 432 |
+
logger.warning(f"Startpage fallback failed: {e}")
|
| 433 |
+
|
| 434 |
+
# Try Searx instances as fallback
|
| 435 |
+
try:
|
| 436 |
+
searx_results = self._search_searx(query, num_results)
|
| 437 |
+
if searx_results:
|
| 438 |
+
results.extend(searx_results)
|
| 439 |
+
logger.info(f"Searx fallback found {len(searx_results)} results")
|
| 440 |
+
except Exception as e:
|
| 441 |
+
logger.warning(f"Searx fallback failed: {e}")
|
| 442 |
+
|
| 443 |
+
return results
|
| 444 |
+
|
| 445 |
+
def _search_bing(self, query: str, num_results: int) -> List[Dict]:
|
| 446 |
+
"""Search using Bing as fallback"""
|
| 447 |
+
try:
|
| 448 |
+
url = "https://www.bing.com/search"
|
| 449 |
+
params = {
|
| 450 |
+
'q': query,
|
| 451 |
+
'count': min(num_results, 50),
|
| 452 |
+
'first': 1
|
| 453 |
+
}
|
| 454 |
+
|
| 455 |
+
headers = self.session.headers.copy()
|
| 456 |
+
headers.update({
|
| 457 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 458 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 459 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
|
| 460 |
+
})
|
| 461 |
+
|
| 462 |
+
response = self.session.get(url, params=params, headers=headers, timeout=self.timeout)
|
| 463 |
+
response.raise_for_status()
|
| 464 |
+
|
| 465 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 466 |
+
results = []
|
| 467 |
+
|
| 468 |
+
# Bing result selectors
|
| 469 |
+
selectors = [
|
| 470 |
+
'h2 a',
|
| 471 |
+
'.b_title a',
|
| 472 |
+
'.b_algo a'
|
| 473 |
+
]
|
| 474 |
+
|
| 475 |
+
for selector in selectors:
|
| 476 |
+
links = soup.select(selector)
|
| 477 |
+
if links:
|
| 478 |
+
logger.info(f"Bing found {len(links)} links with selector: {selector}")
|
| 479 |
+
break
|
| 480 |
+
|
| 481 |
+
for link in links[:num_results]:
|
| 482 |
+
try:
|
| 483 |
+
href = link.get('href')
|
| 484 |
+
if not href or href.startswith('#') or 'bing.com' in href:
|
| 485 |
+
continue
|
| 486 |
+
|
| 487 |
+
title = link.get_text(strip=True)
|
| 488 |
+
if title and href.startswith('http'):
|
| 489 |
+
results.append({
|
| 490 |
+
'url': href,
|
| 491 |
+
'title': title,
|
| 492 |
+
'source': 'bing_fallback'
|
| 493 |
+
})
|
| 494 |
+
except Exception as e:
|
| 495 |
+
logger.debug(f"Error parsing Bing link: {e}")
|
| 496 |
+
continue
|
| 497 |
+
|
| 498 |
+
return results
|
| 499 |
+
|
| 500 |
+
except Exception as e:
|
| 501 |
+
logger.warning(f"Bing search failed: {e}")
|
| 502 |
+
return []
|
| 503 |
+
|
| 504 |
+
def _search_startpage(self, query: str, num_results: int) -> List[Dict]:
|
| 505 |
+
"""Search using Startpage as fallback"""
|
| 506 |
+
try:
|
| 507 |
+
url = "https://www.startpage.com/sp/search"
|
| 508 |
+
params = {
|
| 509 |
+
'query': query,
|
| 510 |
+
'cat': 'web',
|
| 511 |
+
'pl': 'opensearch'
|
| 512 |
+
}
|
| 513 |
+
|
| 514 |
+
headers = self.session.headers.copy()
|
| 515 |
+
headers.update({
|
| 516 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 517 |
+
'Accept-Language': 'en-US,en;q=0.5'
|
| 518 |
+
})
|
| 519 |
+
|
| 520 |
+
response = self.session.get(url, params=params, headers=headers, timeout=self.timeout)
|
| 521 |
+
response.raise_for_status()
|
| 522 |
+
|
| 523 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 524 |
+
results = []
|
| 525 |
+
|
| 526 |
+
# Startpage result selectors
|
| 527 |
+
links = soup.select('a[href*="http"]:not([href*="startpage.com"])')
|
| 528 |
+
|
| 529 |
+
for link in links[:num_results]:
|
| 530 |
+
try:
|
| 531 |
+
href = link.get('href')
|
| 532 |
+
if not href or href.startswith('#') or 'startpage.com' in href:
|
| 533 |
+
continue
|
| 534 |
+
|
| 535 |
+
title = link.get_text(strip=True)
|
| 536 |
+
if title and href.startswith('http'):
|
| 537 |
+
results.append({
|
| 538 |
+
'url': href,
|
| 539 |
+
'title': title,
|
| 540 |
+
'source': 'startpage_fallback'
|
| 541 |
+
})
|
| 542 |
+
except Exception as e:
|
| 543 |
+
logger.debug(f"Error parsing Startpage link: {e}")
|
| 544 |
+
continue
|
| 545 |
+
|
| 546 |
+
return results
|
| 547 |
+
|
| 548 |
+
except Exception as e:
|
| 549 |
+
logger.warning(f"Startpage search failed: {e}")
|
| 550 |
+
return []
|
| 551 |
+
|
| 552 |
+
def _search_searx(self, query: str, num_results: int) -> List[Dict]:
|
| 553 |
+
"""Search using public Searx instances as fallback"""
|
| 554 |
+
searx_instances = [
|
| 555 |
+
"https://searx.be",
|
| 556 |
+
"https://searx.tiekoetter.com",
|
| 557 |
+
"https://searx.xyz"
|
| 558 |
+
]
|
| 559 |
+
|
| 560 |
+
for instance in searx_instances:
|
| 561 |
+
try:
|
| 562 |
+
url = f"{instance}/search"
|
| 563 |
+
params = {
|
| 564 |
+
'q': query,
|
| 565 |
+
'format': 'json'
|
| 566 |
+
}
|
| 567 |
+
|
| 568 |
+
response = self.session.get(url, params=params, timeout=self.timeout)
|
| 569 |
+
response.raise_for_status()
|
| 570 |
+
|
| 571 |
+
data = response.json()
|
| 572 |
+
results = []
|
| 573 |
+
|
| 574 |
+
for result in data.get('results', [])[:num_results]:
|
| 575 |
+
try:
|
| 576 |
+
url = result.get('url', '')
|
| 577 |
+
title = result.get('title', '')
|
| 578 |
+
content = result.get('content', '')
|
| 579 |
+
|
| 580 |
+
if url and title and url.startswith('http'):
|
| 581 |
+
results.append({
|
| 582 |
+
'url': url,
|
| 583 |
+
'title': title,
|
| 584 |
+
'content': content,
|
| 585 |
+
'source': 'searx_fallback'
|
| 586 |
+
})
|
| 587 |
+
except Exception as e:
|
| 588 |
+
logger.debug(f"Error parsing Searx result: {e}")
|
| 589 |
+
continue
|
| 590 |
+
|
| 591 |
+
if results:
|
| 592 |
+
logger.info(f"Searx instance {instance} found {len(results)} results")
|
| 593 |
+
return results
|
| 594 |
+
|
| 595 |
+
except Exception as e:
|
| 596 |
+
logger.debug(f"Searx instance {instance} failed: {e}")
|
| 597 |
+
continue
|
| 598 |
+
|
| 599 |
+
return []
|
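Taken together, the engine tries the HTML, Instant Answer, and Lite endpoints, falls back to Bing, Startpage, and SearX, filters obviously irrelevant URLs, and reranks what is left. A hedged usage sketch, assuming models.reranker.MedicalReranker (imported at the top of this file but not among the files shown here) is importable at runtime:

import logging

logging.basicConfig(level=logging.INFO)

engine = DuckDuckGoEngine(timeout=10)
results = engine.search("chronic migraine prevention", num_results=5)
for r in results:
    # composite_score comes from the reranker, or from the basic fallback scoring above.
    print(f"{r.get('composite_score', 0.0):.2f}  {r['title']}  {r['url']}")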
search/engines/multilingual.py
ADDED
|
@@ -0,0 +1,272 @@
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Dict, Optional
|
| 5 |
+
import time
|
| 6 |
+
import re
|
| 7 |
+
from urllib.parse import urlparse, quote
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class MultilingualCookingEngine:
|
| 12 |
+
"""Multilingual cooking search engine supporting English, Vietnamese, and Chinese sources"""
|
| 13 |
+
|
| 14 |
+
def __init__(self, timeout: int = 15):
|
| 15 |
+
self.session = requests.Session()
|
| 16 |
+
self.session.headers.update({
|
| 17 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 18 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 19 |
+
'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
|
| 20 |
+
'Accept-Encoding': 'gzip, deflate',
|
| 21 |
+
'Connection': 'keep-alive',
|
| 22 |
+
})
|
| 23 |
+
self.timeout = timeout
|
| 24 |
+
|
| 25 |
+
# Comprehensive cooking sources by language
|
| 26 |
+
self.cooking_sources = {
|
| 27 |
+
'en': {
|
| 28 |
+
# Major Cooking Sources
|
| 29 |
+
'allrecipes': {
|
| 30 |
+
'base_url': 'https://www.allrecipes.com',
|
| 31 |
+
'search_url': 'https://www.allrecipes.com/search',
|
| 32 |
+
'domains': ['allrecipes.com'],
|
| 33 |
+
'selectors': ['a[href*="/recipe/"]', 'a[href*="/recipes/"]', '.search-result a']
|
| 34 |
+
},
|
| 35 |
+
'food_network': {
|
| 36 |
+
'base_url': 'https://www.foodnetwork.com',
|
| 37 |
+
'search_url': 'https://www.foodnetwork.com/search',
|
| 38 |
+
'domains': ['foodnetwork.com'],
|
| 39 |
+
'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
|
| 40 |
+
},
|
| 41 |
+
'epicurious': {
|
| 42 |
+
'base_url': 'https://www.epicurious.com',
|
| 43 |
+
'search_url': 'https://www.epicurious.com/search',
|
| 44 |
+
'domains': ['epicurious.com'],
|
| 45 |
+
'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
|
| 46 |
+
},
|
| 47 |
+
'serious_eats': {
|
| 48 |
+
'base_url': 'https://www.seriouseats.com',
|
| 49 |
+
'search_url': 'https://www.seriouseats.com/search',
|
| 50 |
+
'domains': ['seriouseats.com'],
|
| 51 |
+
'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
|
| 52 |
+
},
|
| 53 |
+
'bon_appetit': {
|
| 54 |
+
'base_url': 'https://www.bonappetit.com',
|
| 55 |
+
'search_url': 'https://www.bonappetit.com/search',
|
| 56 |
+
'domains': ['bonappetit.com'],
|
| 57 |
+
'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
|
| 58 |
+
},
|
| 59 |
+
'taste_of_home': {
|
| 60 |
+
'base_url': 'https://www.tasteofhome.com',
|
| 61 |
+
'search_url': 'https://www.tasteofhome.com/search',
|
| 62 |
+
'domains': ['tasteofhome.com'],
|
| 63 |
+
'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
|
| 64 |
+
},
|
| 65 |
+
'food_com': {
|
| 66 |
+
'base_url': 'https://www.food.com',
|
| 67 |
+
'search_url': 'https://www.food.com/search',
|
| 68 |
+
'domains': ['food.com'],
|
| 69 |
+
'selectors': ['a[href*="/recipes/"]', 'a[href*="/recipe/"]', '.search-result a']
|
| 70 |
+
}
|
| 71 |
+
},
|
| 72 |
+
'vi': {
|
| 73 |
+
# Vietnamese Cooking Sources
|
| 74 |
+
'mon_ngon_viet': {
|
| 75 |
+
'base_url': 'https://monngonviet.com',
|
| 76 |
+
'search_url': 'https://monngonviet.com/tim-kiem',
|
| 77 |
+
'domains': ['monngonviet.com'],
|
| 78 |
+
'selectors': ['a[href*="/cong-thuc/"]', 'a[href*="/mon-an/"]', '.search-result a']
|
| 79 |
+
},
|
| 80 |
+
'day_phong_cach': {
|
| 81 |
+
'base_url': 'https://dayphongcach.vn',
|
| 82 |
+
'search_url': 'https://dayphongcach.vn/tim-kiem',
|
| 83 |
+
'domains': ['dayphongcach.vn'],
|
| 84 |
+
'selectors': ['a[href*="/mon-an/"]', 'a[href*="/cong-thuc/"]', '.search-result a']
|
| 85 |
+
},
|
| 86 |
+
'am_thuc_viet': {
|
| 87 |
+
'base_url': 'https://amthucviet.vn',
|
| 88 |
+
'search_url': 'https://amthucviet.vn/tim-kiem',
|
| 89 |
+
'domains': ['amthucviet.vn'],
|
| 90 |
+
'selectors': ['a[href*="/mon-an/"]', 'a[href*="/cong-thuc/"]', '.search-result a']
|
| 91 |
+
}
|
| 92 |
+
},
|
| 93 |
+
'zh': {
|
| 94 |
+
# Chinese Cooking Sources
|
| 95 |
+
'xiachufang': {
|
| 96 |
+
'base_url': 'https://www.xiachufang.com',
|
| 97 |
+
'search_url': 'https://www.xiachufang.com/search',
|
| 98 |
+
'domains': ['xiachufang.com'],
|
| 99 |
+
'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
|
| 100 |
+
},
|
| 101 |
+
'douguo': {
|
| 102 |
+
'base_url': 'https://www.douguo.com',
|
| 103 |
+
'search_url': 'https://www.douguo.com/search',
|
| 104 |
+
'domains': ['douguo.com'],
|
| 105 |
+
'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
|
| 106 |
+
},
|
| 107 |
+
'meishij': {
|
| 108 |
+
'base_url': 'https://www.meishij.net',
|
| 109 |
+
'search_url': 'https://www.meishij.net/search',
|
| 110 |
+
'domains': ['meishij.net'],
|
| 111 |
+
'selectors': ['a[href*="/recipe/"]', 'a[href*="/cook/"]', '.search-result a']
|
| 112 |
+
}
|
| 113 |
+
}
|
| 114 |
+
}
|
| 115 |
+
|
| 116 |
+
def search(self, query: str, num_results: int = 10, languages: List[str] = None) -> List[Dict]:
|
| 117 |
+
"""Search across multiple languages and cooking sources"""
|
| 118 |
+
if languages is None:
|
| 119 |
+
languages = ['en', 'vi', 'zh']
|
| 120 |
+
|
| 121 |
+
all_results = []
|
| 122 |
+
|
| 123 |
+
for lang in languages:
|
| 124 |
+
if lang in self.cooking_sources:
|
| 125 |
+
lang_results = self._search_language_sources(query, lang, num_results // len(languages))
|
| 126 |
+
all_results.extend(lang_results)
|
| 127 |
+
time.sleep(0.5) # Rate limiting between languages
|
| 128 |
+
|
| 129 |
+
return all_results[:num_results]
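The per-language budget above is num_results // len(languages), which silently becomes zero when fewer results than languages are requested and never redistributes the remainder. A small sketch of one possible refinement (not part of the commit):

from typing import Dict, List

def per_language_budget(num_results: int, languages: List[str]) -> Dict[str, int]:
    """Split a result budget across languages, handing the remainder to the earliest ones.

    Guarantees at least one slot per language, so the total may slightly exceed
    num_results when num_results < len(languages).
    """
    base, extra = divmod(max(num_results, 0), max(len(languages), 1))
    return {lang: max(base + (1 if i < extra else 0), 1)
            for i, lang in enumerate(languages)}

# per_language_budget(10, ['en', 'vi', 'zh']) -> {'en': 4, 'vi': 3, 'zh': 3}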
|
| 130 |
+
|
| 131 |
+
def _search_language_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
|
| 132 |
+
"""Search sources for a specific language"""
|
| 133 |
+
results = []
|
| 134 |
+
sources = self.cooking_sources.get(language, {})
|
| 135 |
+
|
| 136 |
+
for source_name, source_config in sources.items():
|
| 137 |
+
if len(results) >= num_results:
|
| 138 |
+
break
|
| 139 |
+
|
| 140 |
+
source_results = self._search_source(query, source_name, source_config, language)
|
| 141 |
+
results.extend(source_results)
|
| 142 |
+
time.sleep(0.3) # Rate limiting
|
| 143 |
+
|
| 144 |
+
return results
|
| 145 |
+
|
| 146 |
+
def _search_source(self, query: str, source_name: str, source_config: Dict, language: str) -> List[Dict]:
|
| 147 |
+
"""Search a specific cooking source"""
|
| 148 |
+
try:
|
| 149 |
+
search_url = source_config.get('search_url')
|
| 150 |
+
if not search_url:
|
| 151 |
+
return []
|
| 152 |
+
|
| 153 |
+
params = {
|
| 154 |
+
'q': query,
|
| 155 |
+
'query': query,
|
| 156 |
+
'search': query,
|
| 157 |
+
'keyword': query
|
| 158 |
+
}
|
| 159 |
+
|
| 160 |
+
response = self.session.get(search_url, params=params, timeout=self.timeout)
|
| 161 |
+
response.raise_for_status()
|
| 162 |
+
|
| 163 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 164 |
+
results = []
|
| 165 |
+
|
| 166 |
+
# Source-specific selectors
|
| 167 |
+
selectors = source_config.get('selectors', ['a[href*="http"]'])
|
| 168 |
+
|
| 169 |
+
for selector in selectors:
|
| 170 |
+
links = soup.select(selector)
|
| 171 |
+
if links:
|
| 172 |
+
logger.info(f"{source_name} found {len(links)} results with selector: {selector}")
|
| 173 |
+
break
|
| 174 |
+
|
| 175 |
+
for link in links[:3]: # Limit per source
|
| 176 |
+
try:
|
| 177 |
+
href = link.get('href')
|
| 178 |
+
if not href:
|
| 179 |
+
continue
|
| 180 |
+
|
| 181 |
+
# Make absolute URL
|
| 182 |
+
if href.startswith('/'):
|
| 183 |
+
href = source_config['base_url'] + href
|
| 184 |
+
|
| 185 |
+
title = link.get_text(strip=True)
|
| 186 |
+
if title and href.startswith('http'):
|
| 187 |
+
results.append({
|
| 188 |
+
'url': href,
|
| 189 |
+
'title': title,
|
| 190 |
+
'source': source_name,
|
| 191 |
+
'domain': source_config['domains'][0],
|
| 192 |
+
'language': language
|
| 193 |
+
})
|
| 194 |
+
except Exception as e:
|
| 195 |
+
logger.debug(f"Error parsing {source_name} link: {e}")
|
| 196 |
+
continue
|
| 197 |
+
|
| 198 |
+
return results
|
| 199 |
+
|
| 200 |
+
except Exception as e:
|
| 201 |
+
logger.warning(f"Cooking source {source_name} ({language}) search failed: {e}")
|
| 202 |
+
return []
|
| 203 |
+
|
| 204 |
+
def search_by_language(self, query: str, language: str, num_results: int = 10) -> List[Dict]:
|
| 205 |
+
"""Search sources for a specific language only"""
|
| 206 |
+
if language not in self.cooking_sources:
|
| 207 |
+
logger.warning(f"Language {language} not supported")
|
| 208 |
+
return []
|
| 209 |
+
|
| 210 |
+
return self._search_language_sources(query, language, num_results)
|
| 211 |
+
|
| 212 |
+
def _get_fallback_sources(self, query: str, language: str, num_results: int) -> List[Dict]:
|
| 213 |
+
"""Get fallback cooking sources when direct search fails"""
|
| 214 |
+
fallback_sources = {
|
| 215 |
+
'en': [
|
| 216 |
+
{
|
| 217 |
+
'url': 'https://www.allrecipes.com/recipes',
|
| 218 |
+
'title': f'AllRecipes: {query}',
|
| 219 |
+
'source': 'allrecipes_fallback',
|
| 220 |
+
'language': 'en',
|
| 221 |
+
'domain': 'allrecipes.com'
|
| 222 |
+
},
|
| 223 |
+
{
|
| 224 |
+
'url': 'https://www.foodnetwork.com/recipes',
|
| 225 |
+
'title': f'Food Network: {query}',
|
| 226 |
+
'source': 'foodnetwork_fallback',
|
| 227 |
+
'language': 'en',
|
| 228 |
+
'domain': 'foodnetwork.com'
|
| 229 |
+
},
|
| 230 |
+
{
|
| 231 |
+
'url': 'https://www.epicurious.com/recipes-menus',
|
| 232 |
+
'title': f'Epicurious: {query}',
|
| 233 |
+
'source': 'epicurious_fallback',
|
| 234 |
+
'language': 'en',
|
| 235 |
+
'domain': 'epicurious.com'
|
| 236 |
+
}
|
| 237 |
+
],
|
| 238 |
+
'vi': [
|
| 239 |
+
{
|
| 240 |
+
'url': 'https://monngonviet.com/cong-thuc',
|
| 241 |
+
'title': f'Món Ngon Việt: {query}',
|
| 242 |
+
'source': 'monngonviet_fallback',
|
| 243 |
+
'language': 'vi',
|
| 244 |
+
'domain': 'monngonviet.com'
|
| 245 |
+
},
|
| 246 |
+
{
|
| 247 |
+
'url': 'https://dayphongcach.vn/mon-an',
|
| 248 |
+
'title': f'Dạy Phong Cách: {query}',
|
| 249 |
+
'source': 'dayphongcach_fallback',
|
| 250 |
+
'language': 'vi',
|
| 251 |
+
'domain': 'dayphongcach.vn'
|
| 252 |
+
}
|
| 253 |
+
],
|
| 254 |
+
'zh': [
|
| 255 |
+
{
|
| 256 |
+
'url': 'https://www.xiachufang.com/recipe',
|
| 257 |
+
'title': f'下厨房: {query}',
|
| 258 |
+
'source': 'xiachufang_fallback',
|
| 259 |
+
'language': 'zh',
|
| 260 |
+
'domain': 'xiachufang.com'
|
| 261 |
+
},
|
| 262 |
+
{
|
| 263 |
+
'url': 'https://www.douguo.com/recipe',
|
| 264 |
+
'title': f'豆果: {query}',
|
| 265 |
+
'source': 'douguo_fallback',
|
| 266 |
+
'language': 'zh',
|
| 267 |
+
'domain': 'douguo.com'
|
| 268 |
+
}
|
| 269 |
+
]
|
| 270 |
+
}
|
| 271 |
+
|
| 272 |
+
return fallback_sources.get(language, [])[:num_results]
|
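A hedged usage sketch of the multilingual engine above; the queries are illustrative:

engine = MultilingualCookingEngine(timeout=10)

# Search every configured language at once...
mixed = engine.search("pho bo", num_results=9, languages=['en', 'vi', 'zh'])

# ...or restrict the search to one language's curated sources.
vietnamese_only = engine.search_by_language("phở bò", language='vi', num_results=5)

for r in mixed + vietnamese_only:
    print(f"[{r['language']}] {r['title']} -> {r['url']}")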
search/engines/video.py
ADDED
|
@@ -0,0 +1,432 @@
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import logging
|
| 4 |
+
from typing import List, Dict, Optional
|
| 5 |
+
import time
|
| 6 |
+
import re
|
| 7 |
+
from urllib.parse import urlparse, quote
|
| 8 |
+
from models.reranker import MedicalReranker
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
class VideoSearchEngine:
|
| 13 |
+
"""Search engine for medical videos across multiple platforms"""
|
| 14 |
+
|
| 15 |
+
def __init__(self, timeout: int = 15):
|
| 16 |
+
self.session = requests.Session()
|
| 17 |
+
self.session.headers.update({
|
| 18 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 19 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 20 |
+
'Accept-Language': 'en-US,en;q=0.5,vi;q=0.3,zh-CN;q=0.3',
|
| 21 |
+
'Accept-Encoding': 'gzip, deflate',
|
| 22 |
+
'Connection': 'keep-alive',
|
| 23 |
+
})
|
| 24 |
+
self.timeout = timeout
|
| 25 |
+
self.reranker = MedicalReranker()
|
| 26 |
+
|
| 27 |
+
# Video platforms by language
|
| 28 |
+
self.video_platforms = {
|
| 29 |
+
'en': [
|
| 30 |
+
{
|
| 31 |
+
'name': 'youtube',
|
| 32 |
+
'search_url': 'https://www.youtube.com/results',
|
| 33 |
+
'params': {'search_query': ''},
|
| 34 |
+
'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
|
| 35 |
+
'base_url': 'https://www.youtube.com'
|
| 36 |
+
},
|
| 37 |
+
{
|
| 38 |
+
'name': 'medscape_videos',
|
| 39 |
+
'search_url': 'https://www.medscape.com/search',
|
| 40 |
+
'params': {'q': ''},
|
| 41 |
+
'selectors': ['a[href*="/video/"]', 'a[href*="/viewarticle/"]'],
|
| 42 |
+
'base_url': 'https://www.medscape.com'
|
| 43 |
+
}
|
| 44 |
+
],
|
| 45 |
+
'vi': [
|
| 46 |
+
{
|
| 47 |
+
'name': 'youtube_vi',
|
| 48 |
+
'search_url': 'https://www.youtube.com/results',
|
| 49 |
+
'params': {'search_query': ''},
|
| 50 |
+
'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
|
| 51 |
+
'base_url': 'https://www.youtube.com'
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
'name': 'vinmec_videos',
|
| 55 |
+
'search_url': 'https://www.vinmec.com/vi/tim-kiem',
|
| 56 |
+
'params': {'q': ''},
|
| 57 |
+
'selectors': ['a[href*="/video/"]', 'a[href*="/suc-khoe/"]'],
|
| 58 |
+
'base_url': 'https://www.vinmec.com'
|
| 59 |
+
}
|
| 60 |
+
],
|
| 61 |
+
'zh': [
|
| 62 |
+
{
|
| 63 |
+
'name': 'youtube_zh',
|
| 64 |
+
'search_url': 'https://www.youtube.com/results',
|
| 65 |
+
'params': {'search_query': ''},
|
| 66 |
+
'selectors': ['a#video-title', 'a[href*="/watch?v="]'],
|
| 67 |
+
'base_url': 'https://www.youtube.com'
|
| 68 |
+
},
|
| 69 |
+
{
|
| 70 |
+
'name': 'haodf_videos',
|
| 71 |
+
'search_url': 'https://www.haodf.com/search',
|
| 72 |
+
'params': {'q': ''},
|
| 73 |
+
'selectors': ['a[href*="/video/"]', 'a[href*="/jibing/"]'],
|
| 74 |
+
'base_url': 'https://www.haodf.com'
|
| 75 |
+
}
|
| 76 |
+
]
|
| 77 |
+
}
|
| 78 |
+
|
| 79 |
+
def _normalize_query(self, q: str) -> str:
|
| 80 |
+
if not q:
|
| 81 |
+
return ""
|
| 82 |
+
q = q.strip()
|
| 83 |
+
q = re.sub(r"^(en|vi|zh)\s*:\s*", "", q, flags=re.IGNORECASE)
|
| 84 |
+
# Remove bullet points and special characters
|
| 85 |
+
q = re.sub(r'[•·▪▫‣⁃]', ' ', q)
|
| 86 |
+
q = re.sub(r'[^\w\s\-\.]', ' ', q)
|
| 87 |
+
q = re.sub(r"\s+", " ", q)
|
| 88 |
+
return q.strip()
|
| 89 |
+
|
| 90 |
+
def _is_valid_medical_video(self, result: Dict, query: str) -> bool:
|
| 91 |
+
"""Check if video is medically relevant and has valid URL"""
|
| 92 |
+
url = result.get('url', '')
|
| 93 |
+
title = result.get('title', '')
|
| 94 |
+
|
| 95 |
+
# Skip generic YouTube search result pages
|
| 96 |
+
if 'results?search_query=' in url:
|
| 97 |
+
return False
|
| 98 |
+
|
| 99 |
+
# Skip non-YouTube URLs that aren't medical platforms
|
| 100 |
+
if 'youtube.com' not in url and not any(med in url for med in ['medscape.com', 'vinmec.com', 'haodf.com']):
|
| 101 |
+
return False
|
| 102 |
+
|
| 103 |
+
# Check if title contains medical keywords or query terms
|
| 104 |
+
title_lower = title.lower()
|
| 105 |
+
query_lower = query.lower()
|
| 106 |
+
|
| 107 |
+
medical_keywords = [
|
| 108 |
+
'medical', 'health', 'doctor', 'treatment', 'diagnosis',
|
| 109 |
+
'symptoms', 'therapy', 'medicine', 'clinical', 'patient',
|
| 110 |
+
'disease', 'condition', 'healthcare', 'physician'
|
| 111 |
+
]
|
| 112 |
+
|
| 113 |
+
# Must contain medical keywords or query terms
|
| 114 |
+
has_medical = any(keyword in title_lower for keyword in medical_keywords)
|
| 115 |
+
has_query = any(word in title_lower for word in query_lower.split() if len(word) > 3)
|
| 116 |
+
|
| 117 |
+
return has_medical or has_query
|
| 118 |
+
|
| 119 |
+
def _search_platform_with_retry(self, query: str, platform: Dict, num_results: int, max_retries: int = 2) -> List[Dict]:
|
| 120 |
+
"""Search platform with retry logic and better error handling"""
|
| 121 |
+
for attempt in range(max_retries):
|
| 122 |
+
try:
|
| 123 |
+
return self._search_platform(query, platform, num_results)
|
| 124 |
+
except Exception as e:
|
| 125 |
+
logger.warning(f"Attempt {attempt + 1} failed for {platform['name']}: {e}")
|
| 126 |
+
if attempt < max_retries - 1:
|
| 127 |
+
time.sleep(1) # Wait before retry
|
| 128 |
+
else:
|
| 129 |
+
logger.error(f"All attempts failed for {platform['name']}")
|
| 130 |
+
return []
|
| 131 |
+
|
| 132 |
+
def search(self, query: str, num_results: int = 3, language: str = 'en') -> List[Dict]:
|
| 133 |
+
"""Search for medical videos across platforms with deduplication and medical filtering"""
|
| 134 |
+
query = self._normalize_query(query)
|
| 135 |
+
logger.info(f"Searching for medical videos: {query} (language: {language})")
|
| 136 |
+
|
| 137 |
+
results = []
|
| 138 |
+
seen_urls = set() # Track URLs to avoid duplicates
|
| 139 |
+
seen_video_ids = set() # Track video IDs to avoid duplicates
|
| 140 |
+
platforms = self.video_platforms.get(language, self.video_platforms['en'])
|
| 141 |
+
|
| 142 |
+
# Try platforms in order of reliability
|
| 143 |
+
for platform in platforms:
|
| 144 |
+
if len(results) >= num_results:
|
| 145 |
+
break
|
| 146 |
+
|
| 147 |
+
try:
|
| 148 |
+
# Add timeout and retry logic
|
| 149 |
+
platform_results = self._search_platform_with_retry(query, platform, num_results * 3)
|
| 150 |
+
|
| 151 |
+
if not platform_results:
|
| 152 |
+
logger.warning(f"No results from {platform['name']}")
|
| 153 |
+
continue
|
| 154 |
+
|
| 155 |
+
# Filter out duplicates and non-medical content
|
| 156 |
+
for result in platform_results:
|
| 157 |
+
url = result.get('url', '')
|
| 158 |
+
video_id = self._extract_video_id(url)
|
| 159 |
+
|
| 160 |
+
# Skip if URL or video ID already seen
|
| 161 |
+
if url in seen_urls or (video_id and video_id in seen_video_ids):
|
| 162 |
+
continue
|
| 163 |
+
|
| 164 |
+
# Check if it's a valid medical video (less strict for more results)
|
| 165 |
+
if self._is_valid_medical_video(result, query):
|
| 166 |
+
seen_urls.add(url)
|
| 167 |
+
if video_id:
|
| 168 |
+
seen_video_ids.add(video_id)
|
| 169 |
+
|
| 170 |
+
# Normalize YouTube URLs
|
| 171 |
+
if video_id and 'youtube.com' in url:
|
| 172 |
+
result['url'] = f"https://www.youtube.com/watch?v={video_id}"
|
| 173 |
+
result['video_id'] = video_id
|
| 174 |
+
|
| 175 |
+
results.append(result)
|
| 176 |
+
if len(results) >= num_results:
|
| 177 |
+
break
|
| 178 |
+
|
| 179 |
+
time.sleep(0.5) # Rate limiting
|
| 180 |
+
except Exception as e:
|
| 181 |
+
logger.warning(f"Video search failed for {platform['name']}: {e}")
|
| 182 |
+
continue
|
| 183 |
+
|
| 184 |
+
# Add fallback video sources if needed
|
| 185 |
+
if len(results) < num_results:
|
| 186 |
+
# Try resilient YouTube via Invidious API
|
| 187 |
+
try:
|
| 188 |
+
resilient = self._search_youtube_invidious(query, language, num_results - len(results))
|
| 189 |
+
for result in resilient:
|
| 190 |
+
url = result.get('url', '')
|
| 191 |
+
video_id = result.get('video_id', '')
|
| 192 |
+
|
| 193 |
+
if (url not in seen_urls and
|
| 194 |
+
video_id not in seen_video_ids and
|
| 195 |
+
self._is_valid_medical_video(result, query)):
|
| 196 |
+
seen_urls.add(url)
|
| 197 |
+
if video_id:
|
| 198 |
+
seen_video_ids.add(video_id)
|
| 199 |
+
results.append(result)
|
| 200 |
+
if len(results) >= num_results:
|
| 201 |
+
break
|
| 202 |
+
except Exception as e:
|
| 203 |
+
logger.warning(f"Invidious fallback failed: {e}")
|
| 204 |
+
|
| 205 |
+
# If still no results, try generic video search fallback
|
| 206 |
+
if len(results) < num_results:
|
| 207 |
+
try:
|
| 208 |
+
fallback_results = self._get_fallback_videos(query, language, num_results - len(results))
|
| 209 |
+
for result in fallback_results:
|
| 210 |
+
if result['url'] not in seen_urls:
|
| 211 |
+
seen_urls.add(result['url'])
|
| 212 |
+
results.append(result)
|
| 213 |
+
if len(results) >= num_results:
|
| 214 |
+
break
|
| 215 |
+
logger.info(f"Added {len(fallback_results)} fallback video results")
|
| 216 |
+
except Exception as e:
|
| 217 |
+
logger.warning(f"Fallback video search failed: {e}")
|
| 218 |
+
|
| 219 |
+
# Use reranker to improve quality and relevance
|
| 220 |
+
if results:
|
| 221 |
+
reranked_results = self.reranker.filter_youtube_results(results, query)
|
| 222 |
+
logger.info(f"Reranked {len(results)} video results to {len(reranked_results)} high-quality results")
|
| 223 |
+
return reranked_results[:num_results]
|
| 224 |
+
|
| 225 |
+
logger.info(f"Found {len(results)} medical video results")
|
| 226 |
+
return results[:num_results]
|
| 227 |
+
|
| 228 |
+
def _search_platform(self, query: str, platform: Dict, num_results: int) -> List[Dict]:
|
| 229 |
+
"""Search a specific video platform with improved error handling"""
|
| 230 |
+
try:
|
| 231 |
+
search_url = platform['search_url']
|
| 232 |
+
params = platform['params'].copy()
|
| 233 |
+
|
| 234 |
+
# Set search query parameter
|
| 235 |
+
for param_name in params.keys():
|
| 236 |
+
params[param_name] = query
|
| 237 |
+
|
| 238 |
+
# Add headers to avoid blocking
|
| 239 |
+
headers = self.session.headers.copy()
|
| 240 |
+
headers.update({
|
| 241 |
+
'Referer': 'https://www.google.com/',
|
| 242 |
+
'Cache-Control': 'no-cache',
|
| 243 |
+
})
|
| 244 |
+
|
| 245 |
+
# Try with shorter timeout first
|
| 246 |
+
response = self.session.get(search_url, params=params, headers=headers, timeout=10)
|
| 247 |
+
|
| 248 |
+
# Check for common error responses
|
| 249 |
+
if response.status_code == 404:
|
| 250 |
+
logger.warning(f"Platform {platform['name']} returned 404 - endpoint may have changed")
|
| 251 |
+
return []
|
| 252 |
+
elif response.status_code == 403:
|
| 253 |
+
logger.warning(f"Platform {platform['name']} returned 403 - may be blocking requests")
|
| 254 |
+
return []
|
| 255 |
+
elif response.status_code >= 400:
|
| 256 |
+
logger.warning(f"Platform {platform['name']} returned {response.status_code}")
|
| 257 |
+
return []
|
| 258 |
+
|
| 259 |
+
response.raise_for_status()
|
| 260 |
+
|
| 261 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 262 |
+
results = []
|
| 263 |
+
|
| 264 |
+
# Try platform-specific selectors
|
| 265 |
+
selectors = platform.get('selectors', ['a[href*="video"]', 'a[href*="watch"]'])
|
| 266 |
+
|
| 267 |
+
links = []
|
| 268 |
+
for selector in selectors:
|
| 269 |
+
links = soup.select(selector)
|
| 270 |
+
if links:
|
| 271 |
+
logger.info(f"{platform['name']} found {len(links)} video links with selector: {selector}")
|
| 272 |
+
break
|
| 273 |
+
|
| 274 |
+
# If no links found, try generic selectors
|
| 275 |
+
if not links:
|
| 276 |
+
generic_selectors = ['a[href*="http"]', 'a[href*="www"]']
|
| 277 |
+
for selector in generic_selectors:
|
| 278 |
+
links = soup.select(selector)
|
| 279 |
+
if links:
|
| 280 |
+
logger.info(f"{platform['name']} found {len(links)} generic links with selector: {selector}")
|
| 281 |
+
break
|
| 282 |
+
|
| 283 |
+
for link in links[:num_results]:
|
| 284 |
+
try:
|
| 285 |
+
href = link.get('href')
|
| 286 |
+
if not href:
|
| 287 |
+
continue
|
| 288 |
+
|
| 289 |
+
# Make absolute URL
|
| 290 |
+
if href.startswith('/'):
|
| 291 |
+
href = platform['base_url'] + href
|
| 292 |
+
|
| 293 |
+
# Skip if not a valid URL
|
| 294 |
+
if not href.startswith('http'):
|
| 295 |
+
continue
|
| 296 |
+
|
| 297 |
+
title = link.get_text(strip=True) or platform['name']
|
| 298 |
+
if title and href:
|
| 299 |
+
results.append({
|
| 300 |
+
'url': href,
|
| 301 |
+
'title': title,
|
| 302 |
+
'platform': platform['name'],
|
| 303 |
+
'type': 'video',
|
| 304 |
+
'source': platform['name']
|
| 305 |
+
})
|
| 306 |
+
except Exception as e:
|
| 307 |
+
logger.debug(f"Error parsing {platform['name']} link: {e}")
|
| 308 |
+
continue
|
| 309 |
+
|
| 310 |
+
return results
|
| 311 |
+
|
| 312 |
+
except requests.exceptions.Timeout:
|
| 313 |
+
logger.warning(f"Platform {platform['name']} search timed out")
|
| 314 |
+
return []
|
| 315 |
+
except requests.exceptions.ConnectionError:
|
| 316 |
+
logger.warning(f"Platform {platform['name']} connection failed - network issue")
|
| 317 |
+
return []
|
| 318 |
+
except Exception as e:
|
| 319 |
+
logger.warning(f"Platform {platform['name']} search failed: {e}")
|
| 320 |
+
return []
|
| 321 |
+
|
| 322 |
+
def _search_youtube_invidious(self, query: str, language: str, needed: int) -> List[Dict]:
|
| 323 |
+
"""Search YouTube via public Invidious instances (no API key)."""
|
| 324 |
+
if needed <= 0:
|
| 325 |
+
return []
|
| 326 |
+
instances = [
|
| 327 |
+
"https://yewtu.be",
|
| 328 |
+
"https://invidious.flokinet.to",
|
| 329 |
+
"https://vid.puffyan.us",
|
| 330 |
+
"https://iv.ggtyler.dev"
|
| 331 |
+
]
|
| 332 |
+
out: List[Dict] = []
|
| 333 |
+
q = quote(query)
|
| 334 |
+
for base in instances:
|
| 335 |
+
if len(out) >= needed:
|
| 336 |
+
break
|
| 337 |
+
try:
|
| 338 |
+
url = f"{base}/api/v1/search?q={q}®ion={'VN' if language=='vi' else 'US'}&fields=title,videoId,author&type=video"
|
| 339 |
+
r = self.session.get(url, timeout=6)
|
| 340 |
+
r.raise_for_status()
|
| 341 |
+
data = r.json()
|
| 342 |
+
for item in data:
|
| 343 |
+
if len(out) >= needed:
|
| 344 |
+
break
|
| 345 |
+
vid = item.get("videoId")
|
| 346 |
+
title = (item.get("title") or "").strip()
|
| 347 |
+
if not vid or not title:
|
| 348 |
+
continue
|
| 349 |
+
out.append({
|
| 350 |
+
'url': f"https://www.youtube.com/watch?v={vid}",
|
| 351 |
+
'title': title,
|
| 352 |
+
'thumbnail': f"https://i.ytimg.com/vi/{vid}/hqdefault.jpg",
|
| 353 |
+
'platform': 'youtube',
|
| 354 |
+
'source': 'youtube',
|
| 355 |
+
'type': 'video',
|
| 356 |
+
'language': language
|
| 357 |
+
})
|
| 358 |
+
except Exception as e:
|
| 359 |
+
logger.debug(f"Invidious {base} failed: {e}")
|
| 360 |
+
continue
|
| 361 |
+
return out
|
| 362 |
+
|
| 363 |
+
def _get_fallback_videos(self, query: str, language: str, num_results: int) -> List[Dict]:
|
| 364 |
+
"""Get fallback video sources when direct search fails"""
|
| 365 |
+
fallback_videos = {
|
| 366 |
+
'en': [
|
| 367 |
+
{
|
| 368 |
+
'url': 'https://www.youtube.com/results?search_query=medical+' + quote(query),
|
| 369 |
+
'title': f'Medical Videos: {query}',
|
| 370 |
+
'platform': 'youtube_fallback',
|
| 371 |
+
'type': 'video',
|
| 372 |
+
'source': 'youtube'
|
| 373 |
+
},
|
| 374 |
+
{
|
| 375 |
+
'url': 'https://www.medscape.com/search?q=' + quote(query),
|
| 376 |
+
'title': f'Medscape Videos: {query}',
|
| 377 |
+
'platform': 'medscape_fallback',
|
| 378 |
+
'type': 'video',
|
| 379 |
+
'source': 'medscape'
|
| 380 |
+
}
|
| 381 |
+
],
|
| 382 |
+
'vi': [
|
| 383 |
+
{
|
| 384 |
+
'url': 'https://www.youtube.com/results?search_query=y+tế+' + quote(query),
|
| 385 |
+
'title': f'Video Y Tế: {query}',
|
| 386 |
+
'platform': 'youtube_vi_fallback',
|
| 387 |
+
'type': 'video',
|
| 388 |
+
'source': 'youtube'
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
'url': 'https://www.vinmec.com/vi/suc-khoe',
|
| 392 |
+
'title': f'Vinmec Videos: {query}',
|
| 393 |
+
'platform': 'vinmec_fallback',
|
| 394 |
+
'type': 'video',
|
| 395 |
+
'source': 'vinmec'
|
| 396 |
+
}
|
| 397 |
+
],
|
| 398 |
+
'zh': [
|
| 399 |
+
{
|
| 400 |
+
'url': 'https://www.youtube.com/results?search_query=医疗+' + quote(query),
|
| 401 |
+
'title': f'医疗视频: {query}',
|
| 402 |
+
'platform': 'youtube_zh_fallback',
|
| 403 |
+
'type': 'video',
|
| 404 |
+
'source': 'youtube'
|
| 405 |
+
},
|
| 406 |
+
{
|
| 407 |
+
'url': 'https://www.haodf.com/jibing',
|
| 408 |
+
'title': f'好大夫视频: {query}',
|
| 409 |
+
'platform': 'haodf_fallback',
|
| 410 |
+
'type': 'video',
|
| 411 |
+
'source': 'haodf'
|
| 412 |
+
}
|
| 413 |
+
]
|
| 414 |
+
}
|
| 415 |
+
|
| 416 |
+
return fallback_videos.get(language, fallback_videos['en'])[:num_results]
|
| 417 |
+
|
| 418 |
+
def _extract_video_id(self, url: str) -> Optional[str]:
|
| 419 |
+
"""Extract YouTube video ID from URL"""
|
| 420 |
+
patterns = [
|
| 421 |
+
r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
|
| 422 |
+
r'(?:embed\/)([0-9A-Za-z_-]{11})',
|
| 423 |
+
r'(?:watch\?v=)([0-9A-Za-z_-]{11})'
|
| 424 |
+
]
|
| 425 |
+
|
| 426 |
+
for pattern in patterns:
|
| 427 |
+
match = re.search(pattern, url)
|
| 428 |
+
if match:
|
| 429 |
+
return match.group(1)
|
| 430 |
+
|
| 431 |
+
return None
|
| 432 |
+
|
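A short usage sketch for the class above, assuming the repository's packages (including the models.reranker.MedicalReranker dependency imported at the top of this file) are importable; the query string and printed fields are illustrative only.

from search.engines.video import VideoSearchEngine

engine = VideoSearchEngine(timeout=10)   # tighter per-request timeout than the 15s default
videos = engine.search("migraine treatment", num_results=3, language="en")

for video in videos:
    # Each result dict carries at least 'url', 'title', 'platform', 'type' and 'source';
    # YouTube hits are normalized to a watch URL and also carry 'video_id'.
    print(video["title"], "-", video["url"])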
search/extractors/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
from .content import ContentExtractor
|
| 2 |
+
|
| 3 |
+
__all__ = ['ContentExtractor']
|
search/extractors/content.py
ADDED
|
@@ -0,0 +1,211 @@
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import logging
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
import re
|
| 6 |
+
from urllib.parse import urlparse
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
logger = logging.getLogger(__name__)
|
| 10 |
+
|
| 11 |
+
class ContentExtractor:
|
| 12 |
+
"""Extract and clean content from web pages"""
|
| 13 |
+
|
| 14 |
+
def __init__(self, timeout: int = 15):
|
| 15 |
+
self.session = requests.Session()
|
| 16 |
+
self.session.headers.update({
|
| 17 |
+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
| 18 |
+
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
| 19 |
+
'Accept-Language': 'en-US,en;q=0.5',
|
| 20 |
+
'Accept-Encoding': 'gzip, deflate',
|
| 21 |
+
'Connection': 'keep-alive',
|
| 22 |
+
})
|
| 23 |
+
self.timeout = timeout
|
| 24 |
+
|
| 25 |
+
# Medical content indicators
|
| 26 |
+
self.medical_indicators = [
|
| 27 |
+
'symptom', 'treatment', 'diagnosis', 'medicine', 'medication',
|
| 28 |
+
'therapy', 'condition', 'disease', 'health', 'medical',
|
| 29 |
+
'doctor', 'physician', 'patient', 'clinical', 'study'
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
def extract(self, url: str, max_length: int = 2000) -> Optional[str]:
|
| 33 |
+
"""Extract content from a URL with medical focus"""
|
| 34 |
+
try:
|
| 35 |
+
response = self.session.get(url, timeout=self.timeout)
|
| 36 |
+
response.raise_for_status()
|
| 37 |
+
|
| 38 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 39 |
+
|
| 40 |
+
# Remove unwanted elements
|
| 41 |
+
self._remove_unwanted_elements(soup)
|
| 42 |
+
|
| 43 |
+
# Extract main content
|
| 44 |
+
content = self._extract_main_content(soup)
|
| 45 |
+
|
| 46 |
+
if not content:
|
| 47 |
+
return None
|
| 48 |
+
|
| 49 |
+
# Clean and process content
|
| 50 |
+
cleaned_content = self._clean_content(content)
|
| 51 |
+
|
| 52 |
+
# Focus on medical content if possible
|
| 53 |
+
medical_content = self._extract_medical_content(cleaned_content)
|
| 54 |
+
|
| 55 |
+
# Truncate to max length
|
| 56 |
+
final_content = self._truncate_content(medical_content or cleaned_content, max_length)
|
| 57 |
+
|
| 58 |
+
return final_content if final_content else None
|
| 59 |
+
|
| 60 |
+
except Exception as e:
|
| 61 |
+
logger.warning(f"Content extraction failed for {url}: {e}")
|
| 62 |
+
return None
|
| 63 |
+
|
| 64 |
+
def _remove_unwanted_elements(self, soup: BeautifulSoup):
|
| 65 |
+
"""Remove unwanted HTML elements"""
|
| 66 |
+
unwanted_tags = [
|
| 67 |
+
'script', 'style', 'nav', 'header', 'footer', 'aside',
|
| 68 |
+
'advertisement', 'ads', 'sidebar', 'menu', 'navigation',
|
| 69 |
+
'social', 'share', 'comment', 'comments', 'related',
|
| 70 |
+
'cookie', 'privacy', 'terms', 'disclaimer'
|
| 71 |
+
]
|
| 72 |
+
|
| 73 |
+
for tag in unwanted_tags:
|
| 74 |
+
for element in soup.find_all(tag):
|
| 75 |
+
element.decompose()
|
| 76 |
+
|
| 77 |
+
# Remove elements with unwanted classes/ids
|
| 78 |
+
unwanted_selectors = [
|
| 79 |
+
'[class*="ad"]', '[class*="advertisement"]', '[class*="sidebar"]',
|
| 80 |
+
'[class*="menu"]', '[class*="nav"]', '[class*="social"]',
|
| 81 |
+
'[class*="share"]', '[class*="comment"]', '[class*="related"]',
|
| 82 |
+
'[id*="ad"]', '[id*="sidebar"]', '[id*="menu"]', '[id*="nav"]'
|
| 83 |
+
]
|
| 84 |
+
|
| 85 |
+
for selector in unwanted_selectors:
|
| 86 |
+
for element in soup.select(selector):
|
| 87 |
+
element.decompose()
|
| 88 |
+
|
| 89 |
+
def _extract_main_content(self, soup: BeautifulSoup) -> str:
|
| 90 |
+
"""Extract main content from the page"""
|
| 91 |
+
# Priority order for content extraction
|
| 92 |
+
content_selectors = [
|
| 93 |
+
'article',
|
| 94 |
+
'main',
|
| 95 |
+
'[role="main"]',
|
| 96 |
+
'.content',
|
| 97 |
+
'.main-content',
|
| 98 |
+
'.article-content',
|
| 99 |
+
'.post-content',
|
| 100 |
+
'.entry-content',
|
| 101 |
+
'.page-content',
|
| 102 |
+
'body'
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
+
for selector in content_selectors:
|
| 106 |
+
elements = soup.select(selector)
|
| 107 |
+
if elements:
|
| 108 |
+
# Get the largest content element
|
| 109 |
+
largest_element = max(elements, key=lambda x: len(x.get_text()))
|
| 110 |
+
content = largest_element.get_text(separator=' ', strip=True)
|
| 111 |
+
if len(content) > 100: # Minimum content length
|
| 112 |
+
return content
|
| 113 |
+
|
| 114 |
+
# Fallback: get all text
|
| 115 |
+
return soup.get_text(separator=' ', strip=True)
|
| 116 |
+
|
| 117 |
+
def _clean_content(self, content: str) -> str:
|
| 118 |
+
"""Clean and normalize content"""
|
| 119 |
+
if not content:
|
| 120 |
+
return ""
|
| 121 |
+
|
| 122 |
+
# Remove excessive whitespace
|
| 123 |
+
content = re.sub(r'\s+', ' ', content)
|
| 124 |
+
|
| 125 |
+
# Remove common web artifacts
|
| 126 |
+
artifacts = [
|
| 127 |
+
r'Cookie\s+Policy',
|
| 128 |
+
r'Privacy\s+Policy',
|
| 129 |
+
r'Terms\s+of\s+Service',
|
| 130 |
+
r'Subscribe\s+to\s+our\s+newsletter',
|
| 131 |
+
r'Follow\s+us\s+on',
|
| 132 |
+
r'Share\s+this\s+article',
|
| 133 |
+
r'Related\s+articles',
|
| 134 |
+
r'Advertisement',
|
| 135 |
+
r'Ad\s+content'
|
| 136 |
+
]
|
| 137 |
+
|
| 138 |
+
for artifact in artifacts:
|
| 139 |
+
content = re.sub(artifact, '', content, flags=re.IGNORECASE)
|
| 140 |
+
|
| 141 |
+
# Remove excessive punctuation
|
| 142 |
+
content = re.sub(r'[.]{3,}', '...', content)
|
| 143 |
+
content = re.sub(r'[!]{2,}', '!', content)
|
| 144 |
+
content = re.sub(r'[?]{2,}', '?', content)
|
| 145 |
+
|
| 146 |
+
return content.strip()
|
| 147 |
+
|
| 148 |
+
def _extract_medical_content(self, content: str) -> Optional[str]:
|
| 149 |
+
"""Extract medical-focused content from the text"""
|
| 150 |
+
if not content:
|
| 151 |
+
return None
|
| 152 |
+
|
| 153 |
+
# Split content into sentences
|
| 154 |
+
sentences = re.split(r'[.!?]+', content)
|
| 155 |
+
medical_sentences = []
|
| 156 |
+
|
| 157 |
+
for sentence in sentences:
|
| 158 |
+
sentence = sentence.strip()
|
| 159 |
+
if len(sentence) < 20: # Skip very short sentences
|
| 160 |
+
continue
|
| 161 |
+
|
| 162 |
+
# Check if sentence contains medical indicators
|
| 163 |
+
sentence_lower = sentence.lower()
|
| 164 |
+
if any(indicator in sentence_lower for indicator in self.medical_indicators):
|
| 165 |
+
medical_sentences.append(sentence)
|
| 166 |
+
|
| 167 |
+
if medical_sentences:
|
| 168 |
+
# Return medical sentences, prioritizing longer ones
|
| 169 |
+
medical_sentences.sort(key=len, reverse=True)
|
| 170 |
+
return '. '.join(medical_sentences[:10]) + '.'
|
| 171 |
+
|
| 172 |
+
return None
|
| 173 |
+
|
| 174 |
+
def _truncate_content(self, content: str, max_length: int) -> str:
|
| 175 |
+
"""Truncate content to max length while preserving sentences"""
|
| 176 |
+
if len(content) <= max_length:
|
| 177 |
+
return content
|
| 178 |
+
|
| 179 |
+
# Try to truncate at sentence boundary
|
| 180 |
+
truncated = content[:max_length]
|
| 181 |
+
last_period = truncated.rfind('.')
|
| 182 |
+
last_exclamation = truncated.rfind('!')
|
| 183 |
+
last_question = truncated.rfind('?')
|
| 184 |
+
|
| 185 |
+
last_sentence_end = max(last_period, last_exclamation, last_question)
|
| 186 |
+
|
| 187 |
+
if last_sentence_end > max_length * 0.7: # If we can find a good break point
|
| 188 |
+
return content[:last_sentence_end + 1]
|
| 189 |
+
|
| 190 |
+
# Fallback: truncate at word boundary
|
| 191 |
+
words = truncated.split()
|
| 192 |
+
if len(words) > 1:
|
| 193 |
+
return ' '.join(words[:-1]) + '...'
|
| 194 |
+
|
| 195 |
+
return truncated + '...'
|
| 196 |
+
|
| 197 |
+
def extract_multiple(self, urls: list, max_length: int = 2000) -> Dict[str, str]:
|
| 198 |
+
"""Extract content from multiple URLs"""
|
| 199 |
+
results = {}
|
| 200 |
+
|
| 201 |
+
for url in urls:
|
| 202 |
+
try:
|
| 203 |
+
content = self.extract(url, max_length)
|
| 204 |
+
if content:
|
| 205 |
+
results[url] = content
|
| 206 |
+
time.sleep(0.5) # Be respectful to servers
|
| 207 |
+
except Exception as e:
|
| 208 |
+
logger.warning(f"Failed to extract content from {url}: {e}")
|
| 209 |
+
continue
|
| 210 |
+
|
| 211 |
+
return results
|
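A usage sketch for the extractor above; the URLs are placeholders, and the calls assume requests and beautifulsoup4 (listed in requirements.txt) are installed.

from search.extractors.content import ContentExtractor

extractor = ContentExtractor(timeout=10)

# Single page: returns cleaned, medically focused text, or None on failure.
text = extractor.extract("https://example.org/article", max_length=1500)
if text:
    print(text[:200])

# Batch mode: returns a {url: content} dict and pauses 0.5s between requests.
pages = extractor.extract_multiple([
    "https://example.org/a",
    "https://example.org/b",
])
print(len(pages), "pages extracted")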
search/processors/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
| 1 |
+
from .cooking import CookingSearchProcessor
|
| 2 |
+
from .language import LanguageProcessor
|
| 3 |
+
from .sources import SourceAggregator
|
| 4 |
+
from .enhanced import EnhancedContentProcessor
|
| 5 |
+
|
| 6 |
+
__all__ = ['CookingSearchProcessor', 'LanguageProcessor', 'SourceAggregator', 'EnhancedContentProcessor']
|
search/processors/cooking.py
ADDED
|
@@ -0,0 +1,258 @@
|
| 1 |
+
import logging
|
| 2 |
+
from typing import List, Dict, Tuple
|
| 3 |
+
from models.summarizer import summarizer
|
| 4 |
+
import re
|
| 5 |
+
|
| 6 |
+
logger = logging.getLogger(__name__)
|
| 7 |
+
|
| 8 |
+
class CookingSearchProcessor:
|
| 9 |
+
"""Process and enhance cooking search results"""
|
| 10 |
+
|
| 11 |
+
def __init__(self):
|
| 12 |
+
self.cooking_keywords = [
|
| 13 |
+
'recipe', 'cooking', 'baking', 'roasting', 'grilling', 'frying', 'boiling', 'steaming',
|
| 14 |
+
'ingredients', 'seasoning', 'spices', 'herbs', 'sauce', 'marinade', 'dressing',
|
| 15 |
+
'technique', 'method', 'temperature', 'timing', 'preparation', 'cooking time',
|
| 16 |
+
'oven', 'stovetop', 'grill', 'pan', 'pot', 'skillet', 'knife', 'cutting',
|
| 17 |
+
'vegetarian', 'vegan', 'gluten-free', 'dairy-free', 'keto', 'paleo', 'diet',
|
| 18 |
+
'appetizer', 'main course', 'dessert', 'breakfast', 'lunch', 'dinner',
|
| 19 |
+
'cuisine', 'italian', 'chinese', 'mexican', 'french', 'indian', 'thai',
|
| 20 |
+
'substitution', 'alternative', 'variation', 'modification', 'adaptation',
|
| 21 |
+
'troubleshooting', 'tips', 'tricks', 'hacks', 'mistakes', 'common errors'
|
| 22 |
+
]
|
| 23 |
+
|
| 24 |
+
def process_results(self, results: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
|
| 25 |
+
"""Process search results and create comprehensive cooking summary"""
|
| 26 |
+
if not results:
|
| 27 |
+
return "", {}
|
| 28 |
+
|
| 29 |
+
# Filter and rank results by cooking relevance
|
| 30 |
+
relevant_results = self._filter_cooking_results(results, user_query)
|
| 31 |
+
|
| 32 |
+
if not relevant_results:
|
| 33 |
+
logger.warning("No cooking-relevant results found")
|
| 34 |
+
return "", {}
|
| 35 |
+
|
| 36 |
+
# Extract and summarize content
|
| 37 |
+
summarized_results = self._summarize_results(relevant_results, user_query)
|
| 38 |
+
|
| 39 |
+
# Create comprehensive summary
|
| 40 |
+
combined_summary = self._create_combined_summary(summarized_results, user_query)
|
| 41 |
+
|
| 42 |
+
# Create URL mapping for citations
|
| 43 |
+
url_mapping = self._create_url_mapping(relevant_results)
|
| 44 |
+
|
| 45 |
+
return combined_summary, url_mapping
|
| 46 |
+
|
| 47 |
+
def _filter_cooking_results(self, results: List[Dict], user_query: str) -> List[Dict]:
|
| 48 |
+
"""Filter results by cooking relevance"""
|
| 49 |
+
relevant_results = []
|
| 50 |
+
|
| 51 |
+
for result in results:
|
| 52 |
+
relevance_score = self._calculate_relevance_score(result, user_query)
|
| 53 |
+
|
| 54 |
+
if relevance_score > 0.3: # Threshold for cooking relevance
|
| 55 |
+
result['relevance_score'] = relevance_score
|
| 56 |
+
relevant_results.append(result)
|
| 57 |
+
|
| 58 |
+
# Sort by relevance score
|
| 59 |
+
relevant_results.sort(key=lambda x: x.get('relevance_score', 0), reverse=True)
|
| 60 |
+
|
| 61 |
+
# Limit to top results
|
| 62 |
+
return relevant_results[:10]
|
| 63 |
+
|
| 64 |
+
def _calculate_relevance_score(self, result: Dict, user_query: str) -> float:
|
| 65 |
+
"""Calculate cooking relevance score for a result"""
|
| 66 |
+
score = 0.0
|
| 67 |
+
|
| 68 |
+
# Check title relevance
|
| 69 |
+
title = result.get('title', '').lower()
|
| 70 |
+
query_lower = user_query.lower()
|
| 71 |
+
|
| 72 |
+
# Direct query match in title
|
| 73 |
+
if any(word in title for word in query_lower.split()):
|
| 74 |
+
score += 0.4
|
| 75 |
+
|
| 76 |
+
# Cooking keyword match in title
|
| 77 |
+
cooking_matches = sum(1 for keyword in self.cooking_keywords if keyword in title)
|
| 78 |
+
score += min(cooking_matches * 0.1, 0.3)
|
| 79 |
+
|
| 80 |
+
# Domain credibility for cooking sources
|
| 81 |
+
url = result.get('url', '').lower()
|
| 82 |
+
credible_domains = [
|
| 83 |
+
'allrecipes.com', 'foodnetwork.com', 'epicurious.com', 'seriouseats.com',
|
| 84 |
+
'bonappetit.com', 'cooking.nytimes.com', 'tasteofhome.com', 'food.com',
|
| 85 |
+
'bbcgoodfood.com', 'jamieoliver.com', 'gordonramsay.com', 'marthastewart.com',
|
| 86 |
+
'kingarthurbaking.com', 'sallysbakingaddiction.com', 'smittenkitchen.com'
|
| 87 |
+
]
|
| 88 |
+
|
| 89 |
+
if any(domain in url for domain in credible_domains):
|
| 90 |
+
score += 0.3
|
| 91 |
+
|
| 92 |
+
# Source type bonus for cooking
|
| 93 |
+
source = result.get('source', '')
|
| 94 |
+
if 'cooking' in source or 'recipe' in source or any(domain in source for domain in credible_domains):
|
| 95 |
+
score += 0.2
|
| 96 |
+
|
| 97 |
+
return min(score, 1.0)
|
| 98 |
+
|
| 99 |
+
def _summarize_results(self, results: List[Dict], user_query: str) -> List[Dict]:
|
| 100 |
+
"""Summarize content from search results"""
|
| 101 |
+
summarized_results = []
|
| 102 |
+
|
| 103 |
+
for i, result in enumerate(results):
|
| 104 |
+
try:
|
| 105 |
+
content = result.get('content', '')
|
| 106 |
+
if not content:
|
| 107 |
+
continue
|
| 108 |
+
|
| 109 |
+
# Create focused summary
|
| 110 |
+
summary = summarizer.summarize_for_query(content, user_query, max_length=300)
|
| 111 |
+
|
| 112 |
+
if summary:
|
| 113 |
+
summarized_results.append({
|
| 114 |
+
'id': i + 1,
|
| 115 |
+
'url': result['url'],
|
| 116 |
+
'title': result['title'],
|
| 117 |
+
'summary': summary,
|
| 118 |
+
'relevance_score': result.get('relevance_score', 0)
|
| 119 |
+
})
|
| 120 |
+
|
| 121 |
+
except Exception as e:
|
| 122 |
+
logger.warning(f"Failed to summarize result {i}: {e}")
|
| 123 |
+
continue
|
| 124 |
+
|
| 125 |
+
return summarized_results
|
| 126 |
+
|
| 127 |
+
def _create_combined_summary(self, summarized_results: List[Dict], user_query: str) -> str:
|
| 128 |
+
"""Create a comprehensive summary from all results with proper source attribution"""
|
| 129 |
+
if not summarized_results:
|
| 130 |
+
return ""
|
| 131 |
+
|
| 132 |
+
logger.info(f"Creating combined summary from {len(summarized_results)} results")
|
| 133 |
+
|
| 134 |
+
# Group by topic/similarity
|
| 135 |
+
topic_groups = self._group_by_topic(summarized_results)
|
| 136 |
+
|
| 137 |
+
summary_parts = []
|
| 138 |
+
citation_counter = 1
|
| 139 |
+
|
| 140 |
+
for topic, results in topic_groups.items():
|
| 141 |
+
if not results:
|
| 142 |
+
continue
|
| 143 |
+
|
| 144 |
+
logger.info(f"Processing {topic} topic with {len(results)} results")
|
| 145 |
+
|
| 146 |
+
# Create topic summary with source attribution
|
| 147 |
+
topic_summary = self._create_topic_summary(topic, results, user_query, citation_counter)
|
| 148 |
+
if topic_summary:
|
| 149 |
+
summary_parts.append(topic_summary)
|
| 150 |
+
# Update citation counter for next topic
|
| 151 |
+
citation_counter += len([r for r in results if r.get('summary')])
|
| 152 |
+
|
| 153 |
+
# Combine all parts
|
| 154 |
+
combined_summary = "\n\n".join(summary_parts)
|
| 155 |
+
|
| 156 |
+
# Don't over-summarize - keep source attribution intact
|
| 157 |
+
if len(combined_summary) > 2000:
|
| 158 |
+
# Only truncate if absolutely necessary, but preserve structure
|
| 159 |
+
lines = combined_summary.split('\n')
|
| 160 |
+
truncated_lines = []
|
| 161 |
+
current_length = 0
|
| 162 |
+
|
| 163 |
+
for line in lines:
|
| 164 |
+
if current_length + len(line) > 2000:
|
| 165 |
+
break
|
| 166 |
+
truncated_lines.append(line)
|
| 167 |
+
current_length += len(line)
|
| 168 |
+
|
| 169 |
+
combined_summary = '\n'.join(truncated_lines)
|
| 170 |
+
if len(truncated_lines) < len(lines):
|
| 171 |
+
combined_summary += "\n\n*[Additional information available from multiple sources]*"
|
| 172 |
+
|
| 173 |
+
logger.info(f"Final combined summary length: {len(combined_summary)} characters")
|
| 174 |
+
return combined_summary
|
| 175 |
+
|
| 176 |
+
def _group_by_topic(self, results: List[Dict]) -> Dict[str, List[Dict]]:
|
| 177 |
+
"""Group results by cooking topic"""
|
| 178 |
+
topics = {
|
| 179 |
+
'recipes': [],
|
| 180 |
+
'techniques': [],
|
| 181 |
+
'ingredients': [],
|
| 182 |
+
'general': []
|
| 183 |
+
}
|
| 184 |
+
|
| 185 |
+
for result in results:
|
| 186 |
+
title_lower = result['title'].lower()
|
| 187 |
+
summary_lower = result.get('summary', '').lower()
|
| 188 |
+
content_lower = f"{title_lower} {summary_lower}"
|
| 189 |
+
|
| 190 |
+
# Categorize by content
|
| 191 |
+
if any(word in content_lower for word in ['recipe', 'ingredients', 'instructions', 'steps']):
|
| 192 |
+
topics['recipes'].append(result)
|
| 193 |
+
elif any(word in content_lower for word in ['technique', 'method', 'how to', 'cooking']):
|
| 194 |
+
topics['techniques'].append(result)
|
| 195 |
+
elif any(word in content_lower for word in ['ingredients', 'substitution', 'alternative', 'variation']):
|
| 196 |
+
topics['ingredients'].append(result)
|
| 197 |
+
else:
|
| 198 |
+
topics['general'].append(result)
|
| 199 |
+
|
| 200 |
+
return topics
|
| 201 |
+
|
| 202 |
+
def _create_topic_summary(self, topic: str, results: List[Dict], user_query: str, citation_start: int = 1) -> str:
|
| 203 |
+
"""Create summary for a specific topic with source attribution"""
|
| 204 |
+
if not results:
|
| 205 |
+
return ""
|
| 206 |
+
|
| 207 |
+
# Add topic header
|
| 208 |
+
topic_headers = {
|
| 209 |
+
'recipes': "**Recipes and Instructions:**",
|
| 210 |
+
'techniques': "**Cooking Techniques:**",
|
| 211 |
+
'ingredients': "**Ingredients and Substitutions:**",
|
| 212 |
+
'general': "**General Information:**"
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
header = topic_headers.get(topic, "**Information:**")
|
| 216 |
+
summary_parts = [header]
|
| 217 |
+
|
| 218 |
+
# Process each result individually to maintain source attribution
|
| 219 |
+
for i, result in enumerate(results[:3]): # Limit to top 3 per topic
|
| 220 |
+
summary = result.get('summary', '')
|
| 221 |
+
if not summary:
|
| 222 |
+
continue
|
| 223 |
+
|
| 224 |
+
# Extract domain from URL for source attribution
|
| 225 |
+
url = result.get('url', '')
|
| 226 |
+
domain = self._extract_domain(url)
|
| 227 |
+
|
| 228 |
+
# Use proper citation number
|
| 229 |
+
citation_num = citation_start + i
|
| 230 |
+
|
| 231 |
+
# Add source attribution
|
| 232 |
+
summary_with_source = f"* {summary} <#{citation_num}>"
|
| 233 |
+
summary_parts.append(summary_with_source)
|
| 234 |
+
|
| 235 |
+
return "\n".join(summary_parts)
|
| 236 |
+
|
| 237 |
+
def _extract_domain(self, url: str) -> str:
|
| 238 |
+
"""Extract domain name from URL"""
|
| 239 |
+
try:
|
| 240 |
+
from urllib.parse import urlparse
|
| 241 |
+
parsed = urlparse(url)
|
| 242 |
+
domain = parsed.netloc.lower()
|
| 243 |
+
# Remove www. prefix
|
| 244 |
+
if domain.startswith('www.'):
|
| 245 |
+
domain = domain[4:]
|
| 246 |
+
return domain
|
| 247 |
+
except Exception:
|
| 248 |
+
return ""
|
| 249 |
+
|
| 250 |
+
def _create_url_mapping(self, results: List[Dict]) -> Dict[int, str]:
|
| 251 |
+
"""Create URL mapping for citations"""
|
| 252 |
+
url_mapping = {}
|
| 253 |
+
|
| 254 |
+
for i, result in enumerate(results):
|
| 255 |
+
url_mapping[i + 1] = result['url']
|
| 256 |
+
|
| 257 |
+
logger.info(f"Created URL mapping for {len(url_mapping)} sources")
|
| 258 |
+
return url_mapping
|
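A sketch of how the processor above is driven; the result dict is a placeholder shaped the way _filter_cooking_results and _summarize_results expect ('url', 'title', 'content'), and the call assumes the shared models.summarizer.summarizer instance imported at the top of this file loads successfully.

from search.processors.cooking import CookingSearchProcessor

results = [{
    "url": "https://www.allrecipes.com/roast-chicken",   # placeholder URL
    "title": "Roast chicken recipe",
    "content": "Ingredients include chicken, butter and thyme. Roast at 200C for about 90 minutes.",
    "source": "allrecipes",
}]

processor = CookingSearchProcessor()
summary, url_mapping = processor.process_results(results, "how to roast a chicken")

print(summary)       # topic-grouped bullets with <#N> citation tags
print(url_mapping)   # {1: 'https://www.allrecipes.com/roast-chicken'}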
search/processors/enhanced.py
ADDED
|
@@ -0,0 +1,331 @@
|
| 1 |
+
import logging
|
| 2 |
+
from typing import List, Dict, Tuple, Set
|
| 3 |
+
import re
|
| 4 |
+
from collections import defaultdict
|
| 5 |
+
from models.summarizer import summarizer
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
class EnhancedContentProcessor:
|
| 10 |
+
"""Enhanced content processing for maximum information extraction"""
|
| 11 |
+
|
| 12 |
+
def __init__(self):
|
| 13 |
+
# Cooking content patterns for extraction
|
| 14 |
+
self.cooking_patterns = {
|
| 15 |
+
'ingredients': [
|
| 16 |
+
r'ingredients?\s+(?:include|are|may include|can include)',
|
| 17 |
+
r'you\s+need',
|
| 18 |
+
r'required\s+ingredients?',
|
| 19 |
+
r'main\s+ingredients?',
|
| 20 |
+
r'key\s+ingredients?'
|
| 21 |
+
],
|
| 22 |
+
'techniques': [
|
| 23 |
+
r'techniques?\s+(?:include|are|may include|can include)',
|
| 24 |
+
r'cooking\s+methods?',
|
| 25 |
+
r'preparation\s+methods?',
|
| 26 |
+
r'how\s+to\s+cook',
|
| 27 |
+
r'cooking\s+process'
|
| 28 |
+
],
|
| 29 |
+
'instructions': [
|
| 30 |
+
r'instructions?\s+(?:include|are|may include)',
|
| 31 |
+
r'steps?\s+(?:include|are|may include)',
|
| 32 |
+
r'how\s+to\s+make',
|
| 33 |
+
r'preparation\s+steps?',
|
| 34 |
+
r'cooking\s+steps?'
|
| 35 |
+
],
|
| 36 |
+
'timing': [
|
| 37 |
+
r'timing\s+(?:include|are|may include)',
|
| 38 |
+
r'cooking\s+time',
|
| 39 |
+
r'preparation\s+time',
|
| 40 |
+
r'total\s+time',
|
| 41 |
+
r'duration'
|
| 42 |
+
],
|
| 43 |
+
'tips': [
|
| 44 |
+
r'tips?\s+(?:include|are|may include)',
|
| 45 |
+
r'advice\s+(?:include|are|may include)',
|
| 46 |
+
r'recommendations?',
|
| 47 |
+
r'helpful\s+hints?',
|
| 48 |
+
r'secrets?'
|
| 49 |
+
],
|
| 50 |
+
'variations': [
|
| 51 |
+
r'variations?\s+(?:include|are|may include)',
|
| 52 |
+
r'substitutions?\s+(?:include|are|may include)',
|
| 53 |
+
r'alternatives?\s+(?:include|are|may include)',
|
| 54 |
+
r'modifications?\s+(?:include|are|may include)',
|
| 55 |
+
r'complications?'
|
| 56 |
+
]
|
| 57 |
+
}
|
| 58 |
+
|
| 59 |
+
# Content quality indicators
|
| 60 |
+
self.quality_indicators = {
|
| 61 |
+
'high': [
|
| 62 |
+
'professional chef', 'culinary institute', 'food science', 'nutrition research',
|
| 63 |
+
'evidence-based', 'peer-reviewed', 'published study', 'research shows',
|
| 64 |
+
'culinary guidelines', 'chef consensus', 'expert opinion'
|
| 65 |
+
],
|
| 66 |
+
'medium': [
|
| 67 |
+
'studies show', 'research indicates', 'culinary literature',
|
| 68 |
+
'professional experience', 'case studies', 'observational studies'
|
| 69 |
+
],
|
| 70 |
+
'low': [
|
| 71 |
+
'some people', 'may help', 'could be', 'might work',
|
| 72 |
+
'anecdotal', 'personal experience', 'unverified'
|
| 73 |
+
]
|
| 74 |
+
}
|
| 75 |
+
|
| 76 |
+
def process_comprehensive_content(self, sources: List[Dict], user_query: str) -> Tuple[str, Dict[int, str]]:
|
| 77 |
+
"""Process all sources to extract maximum relevant information"""
|
| 78 |
+
if not sources:
|
| 79 |
+
return "", {}
|
| 80 |
+
|
| 81 |
+
logger.info(f"Processing {len(sources)} sources for comprehensive information extraction")
|
| 82 |
+
|
| 83 |
+
# Extract structured information from each source
|
| 84 |
+
structured_info = self._extract_structured_information(sources, user_query)
|
| 85 |
+
|
| 86 |
+
# Create comprehensive summary
|
| 87 |
+
comprehensive_summary = self._create_comprehensive_summary(structured_info, user_query)
|
| 88 |
+
|
| 89 |
+
# Create detailed reference mapping
|
| 90 |
+
reference_mapping = self._create_detailed_reference_mapping(sources)
|
| 91 |
+
|
| 92 |
+
return comprehensive_summary, reference_mapping
|
| 93 |
+
|
| 94 |
+
def _extract_structured_information(self, sources: List[Dict], user_query: str) -> Dict[str, List[Dict]]:
|
| 95 |
+
"""Extract structured information by medical categories"""
|
| 96 |
+
structured_info = defaultdict(list)
|
| 97 |
+
|
| 98 |
+
for source in sources:
|
| 99 |
+
content = source.get('content', '')
|
| 100 |
+
if not content:
|
| 101 |
+
continue
|
| 102 |
+
|
| 103 |
+
# Extract information by cooking categories
|
| 104 |
+
for category, patterns in self.cooking_patterns.items():
|
| 105 |
+
extracted_info = self._extract_category_info(content, patterns, category, user_query)
|
| 106 |
+
if extracted_info:
|
| 107 |
+
structured_info[category].append({
|
| 108 |
+
'content': extracted_info,
|
| 109 |
+
'source': source,
|
| 110 |
+
'relevance_score': self._calculate_relevance_score(extracted_info, user_query)
|
| 111 |
+
})
|
| 112 |
+
|
| 113 |
+
# Sort by relevance within each category
|
| 114 |
+
for category in structured_info:
|
| 115 |
+
structured_info[category].sort(key=lambda x: x['relevance_score'], reverse=True)
|
| 116 |
+
|
| 117 |
+
return dict(structured_info)
|
| 118 |
+
|
| 119 |
+
def _extract_category_info(self, content: str, patterns: List[str], category: str, user_query: str) -> str:
|
| 120 |
+
"""Extract information for a specific cooking category"""
|
| 121 |
+
extracted_sentences = []
|
| 122 |
+
|
| 123 |
+
# Split content into sentences
|
| 124 |
+
sentences = re.split(r'[.!?]+', content)
|
| 125 |
+
|
| 126 |
+
for sentence in sentences:
|
| 127 |
+
sentence = sentence.strip()
|
| 128 |
+
if len(sentence) < 20: # Skip very short sentences
|
| 129 |
+
continue
|
| 130 |
+
|
| 131 |
+
# Check if sentence matches any pattern for this category
|
| 132 |
+
for pattern in patterns:
|
| 133 |
+
if re.search(pattern, sentence, re.IGNORECASE):
|
| 134 |
+
# Check relevance to user query
|
| 135 |
+
if self._is_relevant_to_query(sentence, user_query):
|
| 136 |
+
extracted_sentences.append(sentence)
|
| 137 |
+
break
|
| 138 |
+
|
| 139 |
+
# Combine and summarize extracted sentences
|
| 140 |
+
if extracted_sentences:
|
| 141 |
+
combined_text = '. '.join(extracted_sentences[:5]) # Limit to top 5 sentences
|
| 142 |
+
return summarizer.summarize_for_query(combined_text, user_query, max_length=300)
|
| 143 |
+
|
| 144 |
+
return ""
|
| 145 |
+
|
| 146 |
+
def _is_relevant_to_query(self, sentence: str, user_query: str) -> bool:
|
| 147 |
+
"""Check if sentence is relevant to user query"""
|
| 148 |
+
query_words = set(user_query.lower().split())
|
| 149 |
+
sentence_words = set(sentence.lower().split())
|
| 150 |
+
|
| 151 |
+
# Calculate word overlap
|
| 152 |
+
overlap = len(query_words.intersection(sentence_words))
|
| 153 |
+
return overlap >= 2 # At least 2 words in common
|
| 154 |
+
|
| 155 |
+
def _calculate_relevance_score(self, content: str, user_query: str) -> float:
|
| 156 |
+
"""Calculate relevance score for content"""
|
| 157 |
+
if not content or not user_query:
|
| 158 |
+
return 0.0
|
| 159 |
+
|
| 160 |
+
query_words = set(user_query.lower().split())
|
| 161 |
+
content_words = set(content.lower().split())
|
| 162 |
+
|
| 163 |
+
# Word overlap score
|
| 164 |
+
overlap = len(query_words.intersection(content_words))
|
| 165 |
+
overlap_score = overlap / len(query_words) if query_words else 0
|
| 166 |
+
|
| 167 |
+
# Content quality score
|
| 168 |
+
quality_score = self._assess_content_quality(content)
|
| 169 |
+
|
| 170 |
+
# Length score (prefer medium-length content)
|
| 171 |
+
length_score = min(len(content) / 500, 1.0) # Normalize to 0-1
|
| 172 |
+
|
| 173 |
+
# Composite score
|
| 174 |
+
composite_score = (
|
| 175 |
+
overlap_score * 0.5 + # 50% relevance to query
|
| 176 |
+
quality_score * 0.3 + # 30% content quality
|
| 177 |
+
length_score * 0.2 # 20% appropriate length
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
return min(composite_score, 1.0)
|
| 181 |
+
|
| 182 |
+
def _assess_content_quality(self, content: str) -> float:
|
| 183 |
+
"""Assess content quality based on cooking indicators"""
|
| 184 |
+
content_lower = content.lower()
|
| 185 |
+
|
| 186 |
+
high_indicators = sum(1 for indicator in self.quality_indicators['high'] if indicator in content_lower)
|
| 187 |
+
medium_indicators = sum(1 for indicator in self.quality_indicators['medium'] if indicator in content_lower)
|
| 188 |
+
low_indicators = sum(1 for indicator in self.quality_indicators['low'] if indicator in content_lower)
|
| 189 |
+
|
| 190 |
+
# Calculate quality score
|
| 191 |
+
if high_indicators > 0:
|
| 192 |
+
return 0.9
|
| 193 |
+
elif medium_indicators > 0:
|
| 194 |
+
return 0.7
|
| 195 |
+
elif low_indicators > 0:
|
| 196 |
+
return 0.5
|
| 197 |
+
else:
|
| 198 |
+
return 0.6 # Default score for neutral content
|
| 199 |
+
|
| 200 |
+
def _create_comprehensive_summary(self, structured_info: Dict[str, List[Dict]], user_query: str) -> str:
|
| 201 |
+
"""Create comprehensive summary from structured information"""
|
| 202 |
+
if not structured_info:
|
| 203 |
+
return ""
|
| 204 |
+
|
| 205 |
+
summary_parts = []
|
| 206 |
+
|
| 207 |
+
# Process each category
|
| 208 |
+
category_headers = {
|
| 209 |
+
'ingredients': "**🥘 Ingredients & Shopping:**",
|
| 210 |
+
'techniques': "**👨🍳 Cooking Techniques:**",
|
| 211 |
+
'instructions': "**📋 Step-by-Step Instructions:**",
|
| 212 |
+
'timing': "**⏰ Timing & Preparation:**",
|
| 213 |
+
'tips': "**💡 Pro Tips & Tricks:**",
|
| 214 |
+
'variations': "**🔄 Variations & Substitutions:**"
|
| 215 |
+
}
|
| 216 |
+
|
| 217 |
+
for category, info_list in structured_info.items():
|
| 218 |
+
if not info_list:
|
| 219 |
+
continue
|
| 220 |
+
|
| 221 |
+
# Take top 2 most relevant items for each category
|
| 222 |
+
top_items = info_list[:2]
|
| 223 |
+
|
| 224 |
+
category_content = []
|
| 225 |
+
for item in top_items:
|
| 226 |
+
content = item['content']
|
| 227 |
+
if content:
|
| 228 |
+
category_content.append(content)
|
| 229 |
+
|
| 230 |
+
if category_content:
|
| 231 |
+
# Combine and summarize category content
|
| 232 |
+
combined_content = ' '.join(category_content)
|
| 233 |
+
category_summary = summarizer.summarize_for_query(combined_content, user_query, max_length=400)
|
| 234 |
+
|
| 235 |
+
if category_summary:
|
| 236 |
+
header = category_headers.get(category, f"**{category.title()}:**")
|
| 237 |
+
summary_parts.append(f"{header}\n{category_summary}")
|
| 238 |
+
|
| 239 |
+
# Combine all parts
|
| 240 |
+
comprehensive_summary = "\n\n".join(summary_parts)
|
| 241 |
+
|
| 242 |
+
# Final summarization to ensure conciseness
|
| 243 |
+
if len(comprehensive_summary) > 2000:
|
| 244 |
+
comprehensive_summary = summarizer.summarize_text(comprehensive_summary, max_length=2000)
|
| 245 |
+
|
| 246 |
+
return comprehensive_summary
|
| 247 |
+
|
| 248 |
+
    def _create_detailed_reference_mapping(self, sources: List[Dict]) -> Dict[int, Dict]:
        """Create detailed reference mapping with source metadata"""
        reference_mapping = {}

        for i, source in enumerate(sources, 1):
            # Be defensive: some upstream sources may miss optional fields
            reference_mapping[i] = {
                'url': source.get('url', ''),
                'title': source.get('title', ''),
                'domain': source.get('domain', ''),
                'source_type': source.get('source_type', 'text'),
                'language': source.get('language', 'en'),
                'type': source.get('type', 'text'),
                'content_length': len(source.get('content', '')),
                'composite_score': source.get('composite_score', 0.7)
            }

        return reference_mapping

    def create_inline_citations(self, text: str, reference_mapping: Dict[int, Dict]) -> str:
        """Create inline citations within the text"""
        if not reference_mapping:
            return text

        # Find places where citations should be added
        # This is a simplified version - in practice, you'd use more sophisticated NLP

        # Add citations after key cooking statements
        citation_patterns = [
            r'(ingredients?\s+(?:include|are)[^.]*\.)',
            r'(techniques?\s+(?:include|are)[^.]*\.)',
            r'(instructions?\s+(?:include|are)[^.]*\.)',
            r'(timing\s+(?:include|are)[^.]*\.)',
            r'(studies?\s+show[^.]*\.)',
            r'(research\s+(?:indicates|shows)[^.]*\.)'
        ]

        cited_text = text
        citation_count = 1

        for pattern in citation_patterns:
            matches = re.finditer(pattern, cited_text, re.IGNORECASE)
            for match in matches:
                if citation_count <= len(reference_mapping):
                    citation_tag = f" <#{citation_count}>"
                    cited_text = cited_text.replace(match.group(1), match.group(1) + citation_tag, 1)
                    citation_count += 1

        return cited_text

    def generate_source_statistics(self, sources: List[Dict]) -> str:
        """Generate statistics about sources used"""
        if not sources:
            return ""

        total_sources = len(sources)
        # credibility removed

        # Language distribution
        languages = defaultdict(int)
        for source in sources:
            lang = source.get('language', 'en')
            languages[lang] += 1

        # Source type distribution
        source_types = defaultdict(int)
        for source in sources:
            source_type = source.get('source_type', 'other')
            source_types[source_type] += 1

        # Content length statistics
        content_lengths = [len(s.get('content', '')) for s in sources]
        avg_content_length = sum(content_lengths) / len(content_lengths) if content_lengths else 0

        stats_parts = []
        stats_parts.append(f"**📊 Source Statistics:**")
        stats_parts.append(f"• **Total Sources**: {total_sources}")
        # removed credibility summary
        stats_parts.append(f"• **Languages**: {', '.join([f'{count} {lang}' for lang, count in languages.items()])}")
        stats_parts.append(f"• **Types**: {', '.join([f'{count} {type_name}' for type_name, count in source_types.items()])}")
        stats_parts.append(f"• **Avg Content Length**: {avg_content_length:.0f} characters")

        return "\n".join(stats_parts)
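A minimal sketch of how these three helpers fit together. It assumes an instance of the enhanced-response processor class that defines them (the variable name `processor` and the sample source dict are illustrative only, not part of the commit):

    # Hypothetical usage; `processor` stands in for whatever object exposes the methods above.
    sources = [
        {'url': 'https://example.com/roast', 'title': 'How to roast vegetables',
         'domain': 'example.com', 'content': 'Ingredients include carrots and olive oil.'},
    ]
    mapping = processor._create_detailed_reference_mapping(sources)
    text = "Ingredients include carrots and olive oil."
    # The first pattern matches the sentence, so a " <#1>" tag should be appended.
    print(processor.create_inline_citations(text, mapping))
    print(processor.generate_source_statistics(sources))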
search/processors/language.py
ADDED
|
@@ -0,0 +1,266 @@
import re
import logging
from typing import List, Dict, Tuple, Optional
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

logger = logging.getLogger(__name__)

# Set seed for consistent language detection
DetectorFactory.seed = 0

class LanguageProcessor:
    """Process and enhance queries for multilingual medical search"""

    def __init__(self):
        # Medical keywords in different languages
        self.medical_keywords = {
            'en': [
                'symptom', 'symptoms', 'pain', 'headache', 'migraine', 'fever', 'cough',
                'treatment', 'treatments', 'medicine', 'medication', 'drug', 'therapy',
                'diagnosis', 'diagnose', 'condition', 'disease', 'disorder', 'syndrome',
                'doctor', 'physician', 'medical', 'health', 'clinical', 'patient',
                'blood pressure', 'heart', 'lung', 'stomach', 'back', 'neck', 'chest',
                'allergy', 'allergies', 'infection', 'inflammation', 'swelling', 'rash',
                'sleep', 'insomnia', 'anxiety', 'depression', 'stress', 'mental health',
                'pregnancy', 'baby', 'child', 'elderly', 'senior', 'age', 'covid',
                'vaccine', 'immunization', 'surgery', 'operation', 'hospital', 'clinic'
            ],
            'vi': [
                'triệu chứng', 'đau', 'đau đầu', 'đau nửa đầu', 'sốt', 'ho',
                'điều trị', 'thuốc', 'dược phẩm', 'liệu pháp', 'chẩn đoán',
                'bệnh', 'tình trạng', 'rối loạn', 'hội chứng', 'bác sĩ', 'y tế',
                'sức khỏe', 'lâm sàng', 'bệnh nhân', 'huyết áp', 'tim', 'phổi',
                'dạ dày', 'lưng', 'cổ', 'ngực', 'dị ứng', 'nhiễm trùng',
                'viêm', 'sưng', 'phát ban', 'ngủ', 'mất ngủ', 'lo âu',
                'trầm cảm', 'căng thẳng', 'sức khỏe tâm thần', 'mang thai',
                'em bé', 'trẻ em', 'người già', 'tuổi tác', 'covid', 'vaccine',
                'tiêm chủng', 'phẫu thuật', 'bệnh viện', 'phòng khám'
            ],
            'zh': [
                '症状', '疼痛', '头痛', '偏头痛', '发烧', '咳嗽', '治疗', '药物',
                '药品', '疗法', '诊断', '疾病', '状况', '紊乱', '综合征', '医生',
                '医疗', '健康', '临床', '患者', '血压', '心脏', '肺', '胃',
                '背部', '颈部', '胸部', '过敏', '感染', '炎症', '肿胀', '皮疹',
                '睡眠', '失眠', '焦虑', '抑郁', '压力', '心理健康', '怀孕',
                '婴儿', '儿童', '老年人', '年龄', '新冠', '疫苗', '免疫',
                '手术', '医院', '诊所'
            ]
        }

        # Language-specific search enhancements
        self.language_enhancements = {
            'vi': {
                'common_terms': ['là gì', 'nguyên nhân', 'cách điều trị', 'triệu chứng'],
                'medical_context': ['y tế', 'sức khỏe', 'bệnh viện', 'bác sĩ']
            },
            'zh': {
                'common_terms': ['是什么', '原因', '治疗方法', '症状'],
                'medical_context': ['医疗', '健康', '医院', '医生']
            },
            'en': {
                'common_terms': ['what is', 'causes', 'treatment', 'symptoms'],
                'medical_context': ['medical', 'health', 'hospital', 'doctor']
            }
        }

    def detect_language(self, text: str) -> str:
        """Detect the language of the input text"""
        if not text or not text.strip():
            return 'en'  # Default to English

        try:
            # Clean text for better detection
            cleaned_text = re.sub(r'[^\w\s]', ' ', text)
            cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

            if len(cleaned_text) < 3:
                return 'en'

            detected = detect(cleaned_text)

            # Map detected language to our supported languages
            language_mapping = {
                'vi': 'vi',     # Vietnamese
                'zh-cn': 'zh',  # Chinese Simplified
                'zh-tw': 'zh',  # Chinese Traditional
                'zh': 'zh',     # Chinese
                'en': 'en'      # English
            }

            return language_mapping.get(detected, 'en')

        except LangDetectException as e:
            logger.warning(f"Language detection failed: {e}")
            return 'en'

    def enhance_query(self, query: str, target_language: str = None) -> Dict[str, str]:
        """Enhance query for better search results in multiple languages"""
        if not query or not query.strip():
            return {}

        # Detect source language
        source_language = self.detect_language(query)

        # If target language not specified, use source language
        if target_language is None:
            target_language = source_language

        enhanced_queries = {}

        # Original query
        enhanced_queries[source_language] = query

        # Enhance for source language
        if source_language in self.language_enhancements:
            enhanced_queries[source_language] = self._enhance_for_language(
                query, source_language
            )

        # Create translations for other languages if needed
        if target_language != source_language:
            enhanced_queries[target_language] = self._translate_query(
                query, source_language, target_language
            )

        # Add English version for comprehensive search
        if 'en' not in enhanced_queries:
            if source_language != 'en':
                enhanced_queries['en'] = self._translate_query(query, source_language, 'en')
            else:
                enhanced_queries['en'] = query

        return enhanced_queries

    def _enhance_for_language(self, query: str, language: str) -> str:
        """Enhance query for a specific language"""
        enhancements = self.language_enhancements.get(language, {})
        common_terms = enhancements.get('common_terms', [])
        medical_context = enhancements.get('medical_context', [])

        # Check if query already contains medical context
        query_lower = query.lower()
        has_medical_context = any(term in query_lower for term in medical_context)

        # If no medical context, add it
        if not has_medical_context and medical_context:
            # Add the most relevant medical context term
            query += f" {medical_context[0]}"

        # Check if query is a question and add relevant terms
        if any(term in query_lower for term in ['là gì', '是什么', 'what is', 'how', 'tại sao', '为什么', 'why']):
            if common_terms:
                query += f" {common_terms[0]}"  # Add "causes" or equivalent

        return query.strip()

    def _translate_query(self, query: str, source_lang: str, target_lang: str) -> str:
        """Simple keyword-based translation for medical terms"""
        # This is a basic implementation - in production, you'd use a proper translation service

        # Medical term translations
        translations = {
            ('vi', 'en'): {
                'triệu chứng': 'symptoms',
                'đau': 'pain',
                'đau đầu': 'headache',
                'sốt': 'fever',
                'ho': 'cough',
                'điều trị': 'treatment',
                'thuốc': 'medicine',
                'bệnh': 'disease',
                'bác sĩ': 'doctor',
                'sức khỏe': 'health',
                'bệnh viện': 'hospital'
            },
            ('zh', 'en'): {
                '症状': 'symptoms',
                '疼痛': 'pain',
                '头痛': 'headache',
                '发烧': 'fever',
                '咳嗽': 'cough',
                '治疗': 'treatment',
                '药物': 'medicine',
                '疾病': 'disease',
                '医生': 'doctor',
                '健康': 'health',
                '医院': 'hospital'
            },
            ('en', 'vi'): {
                'symptoms': 'triệu chứng',
                'pain': 'đau',
                'headache': 'đau đầu',
                'fever': 'sốt',
                'cough': 'ho',
                'treatment': 'điều trị',
                'medicine': 'thuốc',
                'disease': 'bệnh',
                'doctor': 'bác sĩ',
                'health': 'sức khỏe',
                'hospital': 'bệnh viện'
            },
            ('en', 'zh'): {
                'symptoms': '症状',
                'pain': '疼痛',
                'headache': '头痛',
                'fever': '发烧',
                'cough': '咳嗽',
                'treatment': '治疗',
                'medicine': '药物',
                'disease': '疾病',
                'doctor': '医生',
                'health': '健康',
                'hospital': '医院'
            }
        }

        translation_map = translations.get((source_lang, target_lang), {})

        # Simple word-by-word translation
        translated_query = query
        for source_term, target_term in translation_map.items():
            translated_query = translated_query.replace(source_term, target_term)

        return translated_query

    def get_medical_relevance_score(self, text: str, language: str) -> float:
        """Calculate medical relevance score for text in a specific language"""
        if not text:
            return 0.0

        keywords = self.medical_keywords.get(language, [])
        if not keywords:
            return 0.0

        text_lower = text.lower()
        matches = sum(1 for keyword in keywords if keyword in text_lower)

        # Normalize by text length and keyword count
        score = matches / max(len(keywords), 1)

        # Boost score for longer matches
        if matches > 0:
            score *= (1 + matches * 0.1)

        return min(score, 1.0)

    def filter_by_language(self, results: List[Dict], target_language: str) -> List[Dict]:
        """Filter results by language preference"""
        if not results:
            return results

        # Score results by language match
        scored_results = []
        for result in results:
            result_language = result.get('language', 'en')
            language_score = 1.0 if result_language == target_language else 0.5

            # Add language score to result
            result_copy = result.copy()
            result_copy['language_score'] = language_score
            scored_results.append(result_copy)

        # Sort by language score (prefer target language)
        scored_results.sort(key=lambda x: x.get('language_score', 0), reverse=True)

        return scored_results
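A minimal usage sketch for `LanguageProcessor`, assuming the repository root is on `PYTHONPATH` so the module imports as `search.processors.language`, and that the `langdetect` package from requirements.txt is installed. The exact enhanced/translated strings depend on the simple keyword replacement above, so the expected values in the comments are approximate:

    # Illustrative only; input strings and expected outputs are examples.
    from search.processors.language import LanguageProcessor

    lp = LanguageProcessor()
    print(lp.detect_language("đau đầu và sốt"))        # expected: 'vi'
    queries = lp.enhance_query("đau đầu và sốt", target_language='en')
    # Returns a dict keyed by language, e.g. a 'vi' entry with 'y tế' appended
    # and an 'en' entry produced by keyword-level replacement (contains 'fever').
    print(queries)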
search/processors/sources.py
ADDED
|
@@ -0,0 +1,352 @@
import logging
from typing import List, Dict, Tuple, Set
import re
from urllib.parse import urlparse
from collections import defaultdict

logger = logging.getLogger(__name__)

class SourceAggregator:
    """Aggregate and process sources for comprehensive information extraction"""

    def __init__(self):
        # (Removed credibility scoring; keep placeholder map for future use)
        self.source_credibility = {
            # English sources
            'mayoclinic.org': 0.95,
            'webmd.com': 0.90,
            'healthline.com': 0.88,
            'medlineplus.gov': 0.95,
            'nih.gov': 0.98,
            'cdc.gov': 0.98,
            'who.int': 0.97,
            'pubmed.ncbi.nlm.nih.gov': 0.96,
            'uptodate.com': 0.94,
            'merckmanuals.com': 0.92,
            'medscape.com': 0.89,

            # Vietnamese sources
            'hellobacsi.com': 0.85,
            'alobacsi.com': 0.82,
            'vinmec.com': 0.88,
            'tamanhhospital.vn': 0.85,
            'medlatec.vn': 0.83,
            'suckhoedoisong.vn': 0.90,
            'viendinhduong.vn': 0.87,

            # Chinese sources
            'haodf.com': 0.86,
            'dxy.cn': 0.89,
            'chunyuyisheng.com': 0.84,
            'xywy.com': 0.82,
            'jiankang.com': 0.80,
            'familydoctor.com.cn': 0.85,

            # Video platforms
            'youtube.com': 0.70,
            'medscape.com': 0.89
        }

        # Source type classification
        self.source_types = {
            'academic': ['nih.gov', 'pubmed.ncbi.nlm.nih.gov', 'who.int', 'cdc.gov'],
            'hospital': ['mayoclinic.org', 'vinmec.com', 'tamanhhospital.vn'],
            'commercial': ['webmd.com', 'healthline.com', 'hellobacsi.com'],
            'government': ['medlineplus.gov', 'suckhoedoisong.vn', 'viendinhduong.vn'],
            'professional': ['dxy.cn', 'medscape.com', 'uptodate.com'],
            'video': ['youtube.com', 'medscape.com']
        }

    def aggregate_sources(self, search_results: List[Dict], video_results: List[Dict] = None) -> Dict[str, any]:
        """Aggregate all sources and create comprehensive reference system"""
        all_sources = []

        # Process search results
        for result in search_results:
            source_info = self._process_source(result)
            if source_info:
                all_sources.append(source_info)

        # Process video results
        if video_results:
            for video in video_results:
                video_info = self._process_video_source(video)
                if video_info:
                    all_sources.append(video_info)

        # Remove duplicates and score sources
        unique_sources = self._deduplicate_sources(all_sources)
        scored_sources = self._score_sources(unique_sources)

        # Create comprehensive reference mapping
        reference_mapping = self._create_reference_mapping(scored_sources)

        # Generate source summary
        source_summary = self._generate_source_summary(scored_sources)

        return {
            'sources': scored_sources,
            'reference_mapping': reference_mapping,
            'source_summary': source_summary,
            'total_sources': len(scored_sources),
            'languages': self._get_language_distribution(scored_sources),
            'source_types': self._get_source_type_distribution(scored_sources)
        }

    def _process_source(self, result: Dict) -> Dict:
        """Process a single search result into standardized source format"""
        url = (result or {}).get('url', '')
        if not url:
            return None

        domain = self._extract_domain(url)
        source_type = self._classify_source_type(domain)
        # Normalize fields with safe defaults
        title = str(result.get('title', '') or '').strip()
        content = str(result.get('content', '') or '')
        language = (result.get('language') or 'en').lower()
        source_name = str(result.get('source', '') or '')
        platform = str(result.get('platform', '') or '')

        return {
            'url': url,
            'title': title,
            'content': content,
            'domain': domain,
            'source_type': source_type,
            'language': language,
            'source_name': source_name,
            'platform': platform,
            'type': 'text'
        }

    def _process_video_source(self, video: Dict) -> Dict:
        """Process a video result into standardized source format"""
        url = (video or {}).get('url', '')
        if not url:
            return None

        domain = self._extract_domain(url)
        source_type = 'video'
        title = str(video.get('title', '') or '').strip()
        language = (video.get('language') or 'en').lower()
        source_name = str(video.get('source', '') or '')
        platform = str(video.get('platform', '') or '')
        return {
            'url': url,
            'title': title,
            'content': '',  # Videos don't have text content
            'domain': domain,
            'source_type': source_type,
            'language': language,
            'source_name': source_name,
            'platform': platform,
            'type': 'video'
        }

    def _extract_domain(self, url: str) -> str:
        """Extract domain from URL"""
        try:
            parsed = urlparse(url)
            domain = parsed.netloc.lower()
            # Remove www. prefix
            if domain.startswith('www.'):
                domain = domain[4:]
            return domain
        except:
            return ''

    def _classify_source_type(self, domain: str) -> str:
        """Classify source type based on domain"""
        for source_type, domains in self.source_types.items():
            if domain in domains:
                return source_type
        return 'other'

    def _get_source_credibility(self, domain: str) -> float:
        """Deprecated: credibility scoring removed. Kept for compatibility."""
        return 0.0

    def _deduplicate_sources(self, sources: List[Dict]) -> List[Dict]:
        """Remove duplicate sources based on URL and title similarity"""
        seen_urls = set()
        seen_titles = set()
        unique_sources = []

        for source in sources:
            url = source.get('url', '')
            title = source.get('title', '').lower().strip()

            # Check for URL duplicates
            if url in seen_urls:
                continue

            # Check for title similarity (fuzzy matching)
            title_similar = any(self._titles_similar(title, seen_title) for seen_title in seen_titles)
            if title_similar:
                continue

            seen_urls.add(url)
            seen_titles.add(title)
            unique_sources.append(source)

        return unique_sources

    def _titles_similar(self, title1: str, title2: str, threshold: float = 0.8) -> bool:
        """Check if two titles are similar (simple word overlap)"""
        if not title1 or not title2:
            return False

        words1 = set(title1.split())
        words2 = set(title2.split())

        if not words1 or not words2:
            return False

        intersection = words1.intersection(words2)
        union = words1.union(words2)

        similarity = len(intersection) / len(union) if union else 0
        return similarity >= threshold

    def _score_sources(self, sources: List[Dict]) -> List[Dict]:
        """Score and rank sources by relevance and credibility"""
        for source in sources:
            # Calculate composite score
            content_length = len(source.get('content', ''))
            title_length = len(source.get('title', ''))

            # Content quality score
            content_score = min(content_length / 1000, 1.0)  # Normalize to 0-1

            # Title quality score
            title_score = min(title_length / 100, 1.0)  # Normalize to 0-1

            # Composite score (weighted)
            composite_score = (
                content_score * 0.6 +  # 60% content quality
                title_score * 0.4      # 40% title quality
            )

            source['composite_score'] = composite_score

        # Sort by composite score
        sources.sort(key=lambda x: x.get('composite_score', 0), reverse=True)

        return sources

    def _create_reference_mapping(self, sources: List[Dict]) -> Dict[int, Dict]:
        """Create reference mapping for citations"""
        reference_mapping = {}

        for i, source in enumerate(sources, 1):
            reference_mapping[i] = {
                'url': source['url'],
                'title': source['title'],
                'domain': source['domain'],
                'source_type': source['source_type'],
                'language': source['language'],
                'type': source['type']
            }

        return reference_mapping

    def _generate_source_summary(self, sources: List[Dict]) -> str:
        """Generate summary of sources used"""
        if not sources:
            return "No sources available."

        # Group by source type
        type_counts = defaultdict(int)
        language_counts = defaultdict(int)
        # credibility removed

        for source in sources:
            source_type = source.get('source_type', 'other')
            language = source.get('language', 'en')
            type_counts[source_type] += 1
            language_counts[language] += 1

        # Generate summary
        summary_parts = []
        summary_parts.append(f"**Sources Used ({len(sources)} total):**")

        # Source types
        if type_counts:
            type_summary = ", ".join([f"{count} {type_name}" for type_name, count in type_counts.items()])
            summary_parts.append(f"• **Types**: {type_summary}")

        # Languages
        if language_counts:
            lang_summary = ", ".join([f"{count} {lang}" for lang, count in language_counts.items()])
            summary_parts.append(f"• **Languages**: {lang_summary}")

        # Credibility
        # credibility info removed

        return "\n".join(summary_parts)

    def _get_language_distribution(self, sources: List[Dict]) -> Dict[str, int]:
        """Get distribution of sources by language"""
        distribution = defaultdict(int)
        for source in sources:
            language = source.get('language', 'en')
            distribution[language] += 1
        return dict(distribution)

    def _get_source_type_distribution(self, sources: List[Dict]) -> Dict[str, int]:
        """Get distribution of sources by type"""
        distribution = defaultdict(int)
        for source in sources:
            source_type = source.get('source_type', 'other')
            distribution[source_type] += 1
        return dict(distribution)

    def create_comprehensive_references(self, sources: List[Dict], max_references: int = 15) -> str:
        """Create comprehensive reference list for the response"""
        if not sources:
            return ""

        # Take top sources
        top_sources = sources[:max_references]

        reference_parts = []
        reference_parts.append("**📚 References:**")

        for i, source in enumerate(top_sources, 1):
            url = source.get('url', '')
            title = source.get('title', '')
            domain = source.get('domain', '')
            source_type = source.get('source_type', 'other')
            # credibility removed
            language = source.get('language', 'en')
            source_type_icon = source.get('type', 'other')

            # Create type indicator
            type_icons = {
                'academic': '🎓',
                'hospital': '🏥',
                'government': '🏛️',
                'commercial': '💼',
                'professional': '👨⚕️',
                'video': '📹',
                'other': '📄'
            }
            type_icon = type_icons.get(source_type, '📄')

            # Create language indicator
            lang_icons = {
                'en': '🇺🇸',
                'vi': '🇻🇳',
                'zh': '🇨🇳'
            }
            lang_icon = lang_icons.get(language, '🌐')

            reference_line = f"{i}. {type_icon} {lang_icon} [{title}]({url}) - {domain}"
            reference_parts.append(reference_line)

        if len(sources) > max_references:
            reference_parts.append(f"... and {len(sources) - max_references} more sources")

        return "\n".join(reference_parts)
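A minimal sketch of the `SourceAggregator` pipeline, assuming the module imports as `search.processors.sources`. The result dicts and the YouTube URL are illustrative placeholders; the duplicate entry is included only to show that deduplication drops it:

    # Illustrative only; sample data is made up.
    from search.processors.sources import SourceAggregator

    agg = SourceAggregator()
    results = [
        {'url': 'https://www.mayoclinic.org/headache', 'title': 'Headache basics',
         'content': 'Overview of headache causes and treatments.', 'language': 'en'},
        {'url': 'https://www.mayoclinic.org/headache', 'title': 'Headache basics',
         'content': 'Duplicate entry that should be dropped.', 'language': 'en'},
    ]
    videos = [{'url': 'https://www.youtube.com/watch?v=placeholder', 'title': 'Headache relief tips', 'language': 'en'}]
    bundle = agg.aggregate_sources(results, videos)
    print(bundle['total_sources'])       # 2 after URL deduplication
    print(bundle['source_types'])        # e.g. {'hospital': 1, 'video': 1}
    print(agg.create_comprehensive_references(bundle['sources']))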
search/search.py
ADDED
|
@@ -0,0 +1,362 @@
import logging
from typing import List, Dict, Tuple
import time
import hashlib
from .engines.duckduckgo import DuckDuckGoEngine
from .engines.video import VideoSearchEngine
from .coordinator import SearchCoordinator
# Reranker removed - using simple relevance scoring for cooking content
from models import summarizer

logger = logging.getLogger(__name__)

# Global instances
_duckduckgo_engine = None
_video_engine = None
_reranker = None
_search_coordinator = None

# Simple in-memory cache for search results
_search_cache = {}
_cache_ttl = 300  # 5 minutes TTL

def get_duckduckgo_engine() -> DuckDuckGoEngine:
    """Get or create the global DuckDuckGo engine instance"""
    global _duckduckgo_engine
    if _duckduckgo_engine is None:
        _duckduckgo_engine = DuckDuckGoEngine()
    return _duckduckgo_engine

def get_video_engine() -> VideoSearchEngine:
    """Get or create the global video engine instance"""
    global _video_engine
    if _video_engine is None:
        _video_engine = VideoSearchEngine()
    return _video_engine

def get_reranker():
    """Simple cooking relevance scorer - no complex reranking needed"""
    return None

def get_search_coordinator() -> SearchCoordinator:
    """Get or create the global search coordinator instance"""
    global _search_coordinator
    if _search_coordinator is None:
        _search_coordinator = SearchCoordinator()
    return _search_coordinator

def _clean_search_query(query: str) -> str:
    """Clean search query by removing bullet points and special characters"""
    if not query:
        return ""

    import re
    # Remove bullet points and special characters
    cleaned = re.sub(r'[•·▪▫‣⁃]', ' ', query)
    cleaned = re.sub(r'[^\w\s\-\.]', ' ', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    cleaned = cleaned.strip()

    # Remove common prefixes that might confuse search
    prefixes_to_remove = [
        r'^(en|vi|zh)\s*:\s*',
        r'^(search|find|look for)\s+',
        r'^(how to|what is|what are)\s+',
    ]

    for prefix in prefixes_to_remove:
        cleaned = re.sub(prefix, '', cleaned, flags=re.IGNORECASE)

    return cleaned.strip()

def _boost_cooking_keywords(query: str) -> str:
    """Add cooking context keywords to improve search relevance"""
    if not query:
        return ""

    # Cooking keywords that boost relevance
    cooking_boosters = [
        'recipe', 'cooking', 'culinary', 'technique', 'how to', 'bake', 'roast', 'sear', 'simmer',
        'ingredients', 'measurements', 'temperature', 'timing', 'substitution', 'variation', 'tips'
    ]

    query_lower = query.lower()

    # If query doesn't contain cooking terms, add context
    has_cooking = any(term in query_lower for term in cooking_boosters)

    if not has_cooking:
        # Add cooking context without being too verbose
        if len(query.split()) <= 3:
            return f"{query} cooking recipe technique"
        else:
            return f"{query} cooking tutorial"

    return query

def _get_cache_key(query: str, num_results: int, target_language: str = None, include_videos: bool = True) -> str:
    """Generate cache key for search results"""
    cache_data = f"{query}_{num_results}_{target_language}_{include_videos}"
    return hashlib.md5(cache_data.encode()).hexdigest()

def _get_cached_results(cache_key: str) -> Tuple[str, Dict[int, str], Dict]:
    """Get cached search results if available and not expired"""
    if cache_key not in _search_cache:
        return None, None, None

    cached_data = _search_cache[cache_key]
    if time.time() - cached_data['timestamp'] > _cache_ttl:
        # Cache expired
        del _search_cache[cache_key]
        return None, None, None

    logger.info(f"Using cached search results for key: {cache_key[:8]}...")
    return cached_data['search_context'], cached_data['url_mapping'], cached_data['source_aggregation']

def _cache_results(cache_key: str, search_context: str, url_mapping: Dict[int, str], source_aggregation: Dict):
    """Cache search results"""
    _search_cache[cache_key] = {
        'search_context': search_context,
        'url_mapping': url_mapping,
        'source_aggregation': source_aggregation,
        'timestamp': time.time()
    }
    logger.info(f"Cached search results for key: {cache_key[:8]}...")

class WebSearcher:
    """Legacy wrapper for backward compatibility"""
    def __init__(self):
        self.coordinator = get_search_coordinator()
        self.max_results = 10
        self.timeout = 10

    def search_google(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search using the new coordinator system"""
        try:
            cleaned_query = _clean_search_query(query)
            return self.coordinator.quick_search(cleaned_query, num_results)
        except Exception as e:
            logger.error(f"Search failed: {e}")
            return []

    def search_duckduckgo(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search using DuckDuckGo engine"""
        try:
            cleaned_query = _clean_search_query(query)
            return self.coordinator.quick_search(cleaned_query, num_results)
        except Exception as e:
            logger.error(f"DuckDuckGo search failed: {e}")
            return []

    def extract_content(self, url: str) -> str:
        """Extract content using the new content extractor"""
        try:
            return self.coordinator.content_extractor.extract(url)
        except Exception as e:
            logger.error(f"Content extraction failed: {e}")
            return ""

    def search_and_extract(self, query: str, num_results: int = 10) -> List[Dict]:
        """Search and extract content using the new system"""
        try:
            # Clean the query first
            cleaned_query = _clean_search_query(query)
            # Get search results
            results = self.coordinator.quick_search(cleaned_query, num_results)

            # Extract content for each result
            enriched_results = []
            for result in results:
                content = self.extract_content(result['url'])
                if content:
                    enriched_result = result.copy()
                    enriched_result['content'] = content
                    enriched_results.append(enriched_result)
            return enriched_results
        except Exception as e:
            logger.error(f"Search and extract failed: {e}")
            return []

# Main search function for backward compatibility
def search_web(query: str, num_results: int = 10) -> List[Dict]:
    """Main search function using the new coordinator system"""
    try:
        # Clean the query first
        cleaned_query = _clean_search_query(query)
        coordinator = get_search_coordinator()
        return coordinator.quick_search(cleaned_query, num_results)
    except Exception as e:
        logger.error(f"Web search failed: {e}")
        return []

# Enhanced search function with content extraction
def search_web_with_content(query: str, num_results: int = 10) -> Tuple[str, Dict[int, str]]:
    """Enhanced search with content extraction and summarization"""
    try:
        # Clean the query first
        cleaned_query = _clean_search_query(query)
        coordinator = get_search_coordinator()
        return coordinator.search(cleaned_query, num_results)
    except Exception as e:
        logger.error(f"Enhanced web search failed: {e}")
        return "", {}

# Cooking-focused search function
def search_cooking(query: str, num_results: int = 8) -> Tuple[str, Dict[int, str]]:
    """Cooking-focused search with enhanced processing"""
    try:
        # Clean the query first
        cleaned_query = _clean_search_query(query)
        coordinator = get_search_coordinator()
        return coordinator.cooking_focus_search(cleaned_query, num_results)
    except Exception as e:
        logger.error(f"Cooking search failed: {e}")
        return "", {}

# Multilingual cooking search function
def search_multilingual_cooking(query: str, num_results: int = 10, target_language: str = None) -> Tuple[str, Dict[int, str]]:
    """Comprehensive multilingual cooking search supporting English, Vietnamese, and Chinese"""
    try:
        # Clean the query first
        cleaned_query = _clean_search_query(query)
        coordinator = get_search_coordinator()
        return coordinator.multilingual_cooking_search(cleaned_query, num_results, target_language)
    except Exception as e:
        logger.error(f"Multilingual cooking search failed: {e}")
        return "", {}

# Video search function
def search_videos(query: str, num_results: int = 2, target_language: str = None) -> List[Dict]:
    """Search for cooking videos across multiple platforms"""
    try:
        # Clean the query first
        cleaned_query = _clean_search_query(query)
        coordinator = get_search_coordinator()
        return coordinator.video_search(cleaned_query, num_results, target_language)
    except Exception as e:
        logger.error(f"Video search failed: {e}")
        return []

# Comprehensive search function with maximum information extraction
def search_comprehensive(query: str, num_results: int = 15, target_language: str = None, include_videos: bool = True) -> Tuple[str, Dict[int, str], Dict]:
    """Comprehensive search with maximum information extraction and detailed references"""
    logger.info(f"Starting comprehensive search for: {query} (target: {target_language})")

    # Check cache first
    cache_key = _get_cache_key(query, num_results, target_language, include_videos)
    cached_context, cached_mapping, cached_aggregation = _get_cached_results(cache_key)
    if cached_context is not None:
        return cached_context, cached_mapping, cached_aggregation

    # Clean and boost the query for better cooking relevance
    cleaned_query = _clean_search_query(query)
    boosted_query = _boost_cooking_keywords(cleaned_query)
    logger.info(f"Query processing: '{query}' -> '{cleaned_query}' -> '{boosted_query}'")

    # Get engines
    duckduckgo_engine = get_duckduckgo_engine()
    video_engine = get_video_engine()
    reranker = get_reranker()

    # Optimized search strategy: get just enough results for good filtering
    # Calculate optimal initial count based on expected filtering ratio
    expected_filter_ratio = 0.4  # Expect to keep ~40% after filtering
    optimal_initial_count = max(num_results * 2, int(num_results / expected_filter_ratio))

    # Search for text results with optimized count
    text_results = duckduckgo_engine.search(boosted_query, optimal_initial_count)
    logger.info(f"Found {len(text_results)} text results (requested {optimal_initial_count})")

    # If no text results, try simple fallback search
    if not text_results:
        logger.warning("No text results found, trying simple fallback search")
        try:
            # Try with a very simple query
            simple_query = " ".join(cleaned_query.split()[:3])  # First 3 words only
            text_results = duckduckgo_engine.search(simple_query, num_results)
            logger.info(f"Simple fallback found {len(text_results)} results")
        except Exception as e:
            logger.warning(f"Simple fallback search failed: {e}")

    # Search for videos if requested (limit to avoid over-fetching)
    video_results = []
    if include_videos:
        try:
            # Map language codes for video search
            lang_mapping = {
                'EN': 'en',
                'VI': 'vi',
                'ZH': 'zh',
                'en': 'en',
                'vi': 'vi',
                'zh': 'zh'
            }
            search_language = lang_mapping.get(target_language, 'en')
            # Limit video results to avoid over-fetching
            max_video_results = min(5, num_results // 3)  # Max 5 or 1/3 of total
            video_results = video_engine.search(boosted_query, num_results=max_video_results, language=search_language)
            logger.info(f"Found {len(video_results)} video results")
        except Exception as e:
            logger.warning(f"Video search failed: {e}")

    # Combine all results
    all_results = text_results + video_results

    # Simple cooking relevance filtering
    if all_results:
        # Filter by cooking relevance using simple keyword matching
        cooking_keywords = ['recipe', 'cooking', 'baking', 'food', 'ingredient', 'kitchen', 'chef', 'meal', 'dish', 'cuisine', 'cook', 'bake', 'roast', 'grill', 'fry', 'boil', 'steam', 'season', 'spice', 'herb', 'sauce', 'marinade', 'dressing']
        relevant_results = []
        for result in all_results:
            title = result.get('title', '').lower()
            content = result.get('content', '').lower()
            if any(keyword in title or keyword in content for keyword in cooking_keywords):
                relevant_results.append(result)

        if relevant_results:
            all_results = relevant_results
            logger.info(f"Filtered to {len(all_results)} cooking-relevant results")

    # Limit final results to requested count
    all_results = all_results[:num_results]

    # Final safety check - ensure we have at least some results
    if not all_results and text_results:
        logger.warning("No results after processing, using original text results as fallback")
        all_results = text_results[:num_results]

    # Create URL mapping
    url_mapping = {}
    for i, result in enumerate(all_results, 1):
        url_mapping[i] = result.get('url', '')

    # Create search context using summarizer (only for top results)
    search_context = ""
    if all_results:
        summaries = []
        # Only summarize top results to avoid over-processing
        top_results = all_results[:min(10, len(all_results))]
        for i, result in enumerate(top_results, 1):
            content = result.get('content', '') or result.get('title', '')
            if content:
                # Use query-focused summarization
                summary = summarizer.summarize_for_query(content, boosted_query, max_length=300)
                if summary:
                    summaries.append(f"Document {i}: {summary}")

        search_context = "\n\n".join(summaries)

    # Create source aggregation
    source_aggregation = {
        'total_sources': len(all_results),
        'text_sources': len(text_results),
        'video_sources': len(video_results),
        'sources': all_results
    }

    logger.info(f"Comprehensive search completed: {len(all_results)} total sources")

    # Cache the results
    _cache_results(cache_key, search_context, url_mapping, source_aggregation)

    return search_context, url_mapping, source_aggregation
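A minimal sketch of calling the top-level entry point `search_comprehensive`, assuming the repo root is on `PYTHONPATH` and network access is available (the first call goes out through the DuckDuckGo and video engines; an identical call within the 300-second TTL is served from the in-memory cache). The example query is illustrative:

    # Illustrative only; output counts depend on live search results.
    from search.search import search_comprehensive

    context, url_mapping, aggregation = search_comprehensive(
        "cách làm phở bò", num_results=8, target_language="vi", include_videos=True
    )
    print(aggregation['total_sources'], "sources,", aggregation['video_sources'], "videos")
    print(url_mapping.get(1))  # URL of the top-ranked source, if any

    # Repeating the same call before the cache TTL expires reuses the cached context.
    context_again, _, _ = search_comprehensive(
        "cách làm phở bò", num_results=8, target_language="vi", include_videos=True
    )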
utils/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
# Utils package
from .translation import translate_query
from .vlm import process_medical_image
from .diagnosis import retrieve_diagnosis_from_symptoms
utils/migrate.py
ADDED
|
@@ -0,0 +1,54 @@
# Run this script to split the FAISS index collection out to a second/different cluster.
from pymongo import MongoClient
from dotenv import load_dotenv
import os

def migrate_faiss_index():
    """Migrate FAISS index from QA cluster to index cluster"""
    # Load environment variables from .env
    load_dotenv()
    # Connection strings (update as needed)
    mongo_uri = os.getenv("MONGO_URI")  # QA cluster connection string
    index_uri = os.getenv("INDEX_URI")  # FAISS index cluster connection string

    if not mongo_uri:
        raise ValueError("MONGO_URI is missing!")
    if not index_uri:
        raise ValueError("INDEX_URI is missing!")

    # Connect to the QA cluster (where FAISS data was accidentally stored)
    qa_client = MongoClient(mongo_uri)
    qa_db = qa_client["MedicalChatbotDB"]

    # Connect to the FAISS index cluster
    faiss_client = MongoClient(index_uri)
    faiss_db = faiss_client["MedicalChatbotDB"]  # Use the same database name if desired

    # Define the GridFS collections to move.
    # In GridFS, files are stored in two collections: "<bucket>.files" and "<bucket>.chunks".
    source_files = qa_db["faiss_index_files.files"]
    source_chunks = qa_db["faiss_index_files.chunks"]

    dest_files = faiss_db["faiss_index_files.files"]
    dest_chunks = faiss_db["faiss_index_files.chunks"]

    print("Moving FAISS index GridFS files...")

    # Copy documents from the source 'files' collection
    for doc in source_files.find():
        dest_files.insert_one(doc)

    # Copy documents from the source 'chunks' collection
    for doc in source_chunks.find():
        dest_chunks.insert_one(doc)

    print("✅ FAISS GridFS collections moved successfully.")

    # Optionally, drop the old collections from the QA cluster to free up space:
    qa_db.drop_collection("faiss_index_files.files")
    qa_db.drop_collection("faiss_index_files.chunks")
    print("Old FAISS GridFS collections dropped from the QA cluster.")

# Only run when called directly
if __name__ == "__main__":
    migrate_faiss_index()
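The script reads both connection strings from a local `.env` file before touching either cluster. A minimal sketch of driving it (the URIs shown are placeholders, not real credentials):

    # Illustrative driver; MONGO_URI / INDEX_URI values below are placeholders.
    # .env:
    #   MONGO_URI=mongodb+srv://<user>:<pass>@qa-cluster.example.mongodb.net
    #   INDEX_URI=mongodb+srv://<user>:<pass>@index-cluster.example.mongodb.net
    from utils.migrate import migrate_faiss_index

    # Copies the faiss_index_files.* GridFS collections to the index cluster,
    # then drops the originals from the QA cluster.
    migrate_faiss_index()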
utils/symbipredict_2022.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
utils/translation.py
ADDED
|
@@ -0,0 +1,141 @@
# translation.py
from transformers import pipeline
import logging
import re
from collections import Counter

logger = logging.getLogger("translation-agent")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True)  # Change INFO to DEBUG for full-ctx JSON loader

# To use lazy model loader
vi_en = None
zh_en = None

def _dedupe_repeats(s: str, n_min: int = 3, n_max: int = 7) -> str:
    """Collapse excessive repeated n-grams and repeated phrases with improved logic."""
    if not s:
        return s

    # Collapse repeated spaces/newlines
    s = re.sub(r"\s+", " ", s).strip()

    # More aggressive repetition detection
    # Check for simple word repetition (like "a lot of people do not" repeated)
    words = s.split()
    if len(words) > 20:  # Only check if text is long enough
        # Look for repeated sequences of 3-8 words
        for seq_len in range(8, 2, -1):
            if len(words) < seq_len * 3:  # Need at least 3 repetitions
                continue

            # Check each possible starting position
            for start in range(len(words) - seq_len * 2):
                sequence = words[start:start + seq_len]
                # Count how many times this sequence repeats
                repeat_count = 1
                pos = start + seq_len
                while pos + seq_len <= len(words):
                    if words[pos:pos + seq_len] == sequence:
                        repeat_count += 1
                        pos += seq_len
                    else:
                        break

                # If we found 3+ repetitions, remove the excess
                if repeat_count >= 3:
                    # Keep only the first occurrence
                    new_words = words[:start + seq_len] + words[start + seq_len * repeat_count:]
                    s = " ".join(new_words)
                    words = s.split()
                    break
            else:
                continue
            break  # Break outer loop if we found and fixed a repetition

    # Additional cleanup for remaining patterns
    # Remove consecutive identical word
    tokens = s.split()
    out = []
    last = None
    for t in tokens:
        if last is None or t.lower() != last.lower():
            out.append(t)
        last = t
    s = " ".join(out)

    # Limit consecutive duplicate n-grams
    for n in range(n_max, n_min - 1, -1):
        pattern = re.compile(r"(\b(?:\w+\s+){%d}\w+\b)(?:\s+\1){2,}" % (n - 1), flags=re.IGNORECASE)
        s = pattern.sub(r"\1", s)

    return s


def _normalize_and_cap(s: str, cap: int = 512) -> str:
    if not s:
        return s
    s = s.strip()
    if len(s) > cap:
        s = s[:cap]
    return s


def _is_too_repetitive(s: str, threshold: float = 0.4) -> bool:
    if not s:
        return False
    tokens = [t.lower() for t in s.split()]
    if len(tokens) < 10:
        return False
    counts = Counter(tokens)
    top = counts.most_common(1)[0][1]
    return (top / max(1, len(tokens))) >= threshold


def translate_query(text: str, lang_code: str) -> str:
    global vi_en, zh_en

    if not text or not text.strip():
        return text

    try:
        if lang_code == "vi":
            if vi_en is None:
                logger.info("[Translation] Loading Vietnamese-English model...")
                vi_en = pipeline("translation", model="VietAI/envit5-translation", src_lang="vi", tgt_lang="en", device=-1)

            # Limit input length to prevent model issues
            input_text = text[:1000] if len(text) > 1000 else text
            raw = vi_en(input_text, max_length=512)[0]["translation_text"]
            cleaned = _dedupe_repeats(raw)
            norm = _normalize_and_cap(cleaned, cap=512)

            if _is_too_repetitive(norm) or len(norm.strip()) < 10:
                logger.warning("[En-Vi] Translation repetitive or too short; falling back to original text")
                return text

            logger.info(f"[En-Vi] Query in `{lang_code}` translated to: {norm[:100]}...")
            return norm

        elif lang_code == "zh":
            if zh_en is None:
                logger.info("[Translation] Loading Chinese-English model...")
                zh_en = pipeline("translation", model="Helsinki-NLP/opus-mt-zh-en", device=-1)

            # Limit input length to prevent model issues
            input_text = text[:1000] if len(text) > 1000 else text
            raw = zh_en(input_text, max_length=512)[0]["translation_text"]
            cleaned = _dedupe_repeats(raw)
            norm = _normalize_and_cap(cleaned, cap=512)

            if _is_too_repetitive(norm) or len(norm.strip()) < 10:
                logger.warning("[En-Zh] Translation repetitive or too short; falling back to original text")
                return text

            logger.info(f"[En-Zh] Query in `{lang_code}` translated to: {norm[:100]}...")
            return norm

    except Exception as e:
        logger.error(f"[Translation] Translation failed for {lang_code}: {e}")
        return text  # Fallback to original text

    return text
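A minimal sketch of `translate_query`, assuming the transformers models can be downloaded on first use (the first `vi` call lazily loads VietAI/envit5-translation, the first `zh` call loads Helsinki-NLP/opus-mt-zh-en, so both are slow and need network access). The input strings are illustrative:

    # Illustrative only; translations are model outputs and may vary.
    from utils.translation import translate_query

    print(translate_query("Tôi bị đau đầu và sốt nhẹ", "vi"))  # English text, or the original on failure
    print(translate_query("我头痛发烧", "zh"))                   # English text, or the original on failure
    print(translate_query("plain English stays unchanged", "en"))  # unsupported code: returned as-is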
utils/vlm.py
ADDED
|
@@ -0,0 +1,54 @@
import os, logging, traceback, json, base64
from io import BytesIO
from PIL import Image
from .translation import translate_query
from gradio_client import Client, handle_file
import tempfile

logger = logging.getLogger("vlm-agent")
logging.basicConfig(level=logging.INFO, format="%(asctime)s — %(name)s — %(levelname)s — %(message)s", force=True)

# ✅ Load Gradio client once
gr_client = None
def load_gradio_client():
    global gr_client
    if gr_client is None:
        logger.info("[VLM] ⏳ Connecting to MedGEMMA Gradio Space...")
        gr_client = Client("warshanks/medgemma-4b-it")
        logger.info("[VLM] Gradio MedGEMMA client ready.")
    return gr_client

def process_medical_image(base64_image: str, prompt: str = None, lang: str = "EN") -> str:
    if not prompt:
        prompt = "Describe and investigate any clinical findings from this medical image."
    elif lang.upper() in {"VI", "ZH"}:
        prompt = translate_query(prompt, lang.lower())

    try:
        # 1️⃣ Decode base64 image to temp file
        image_data = base64.b64decode(base64_image)
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            tmp.write(image_data)
            tmp.flush()
            image_path = tmp.name

        # 2️⃣ Send to Gradio MedGEMMA
        client = load_gradio_client()
        logger.info(f"[VLM] Sending prompt: {prompt}")
        result = client.predict(
            message={"text": prompt, "files": [handle_file(image_path)]},
            param_2="You analyze medical images and report abnormalities, diseases with clear diagnostic insight.",
            param_3=2048,
            api_name="/chat"
        )
        if isinstance(result, str):
            logger.info(f"[VLM] ✅ Response: {result}")
            return result.strip()
        else:
            logger.warning(f"[VLM] ⚠️ Unexpected result type: {type(result)} — {result}")
            return str(result)

    except Exception as e:
        logger.error(f"[VLM] ❌ Exception: {e}")
        logger.error(f"[VLM] 🔍 Traceback:\n{traceback.format_exc()}")
        return f"[VLM] ⚠️ Failed to process image: {e}"
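A minimal sketch of calling `process_medical_image`, assuming network access to the remote warshanks/medgemma-4b-it Space. The file path "xray.png" and the prompt are placeholders for whatever image and question the caller supplies:

    # Illustrative only; the image path is a placeholder.
    import base64
    from utils.vlm import process_medical_image

    with open("xray.png", "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")

    report = process_medical_image(b64, prompt="Any signs of pneumonia?", lang="EN")
    print(report)  # model text, or a "[VLM] ⚠️ Failed to process image: ..." string on error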