Commit · 49e67a8
Prathamesh Sutar committed
1 Parent(s): 6675816

Initial deployment of Pravaah Ocean Hazard Detection System
Browse files
- .gitattributes +0 -35
- .gitignore +53 -0
- DEPLOYMENT_GUIDE.md +189 -0
- Dockerfile +48 -0
- README.md +139 -6
- api.py +252 -0
- app.py +197 -0
- classifier.py +52 -0
- ner.py +125 -0
- pg_db.py +133 -0
- requirements.txt +29 -0
- scraper.py +92 -0
- sentiment.py +59 -0
- translate.py +69 -0
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,53 @@
+# Environment variables
+.env
+.env.local
+.env.production
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+venv/
+env/
+ENV/
+class/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
+
+# Database
+*.db
+*.sqlite3
+
+# Model cache (optional - uncomment if you want to ignore cached models)
+# .cache/
+# models/
DEPLOYMENT_GUIDE.md
ADDED
@@ -0,0 +1,189 @@
+# 🚀 Pravaah Deployment Guide
+
+This guide will help you deploy the Pravaah Ocean Hazard Detection System with both FastAPI and Gradio interfaces.
+
+## 📁 **Files in Pravaah Folder**
+
+### **Core Application Files:**
+- **`app.py`** - Gradio web interface (Port 7860)
+- **`api.py`** - FastAPI REST API (Port 8000)
+- **`Dockerfile`** - Docker configuration for both services
+- **`requirements.txt`** - Python dependencies
+
+### **AI/ML Modules:**
+- **`classifier.py`** - Hazard classification
+- **`scraper.py`** - Twitter data fetching
+- **`ner.py`** - Named Entity Recognition
+- **`sentiment.py`** - Sentiment analysis
+- **`translate.py`** - Translation pipeline
+- **`pg_db.py`** - Database operations
+
+### **Documentation:**
+- **`README.md`** - Project documentation
+- **`DEPLOYMENT_GUIDE.md`** - This file
+
+## 🌐 **Services Overview**
+
+### **FastAPI (Port 8000)**
+- **REST API** for programmatic access
+- **Swagger UI** at `/docs`
+- **ReDoc** at `/redoc`
+- **Health checks** at `/health`
+
+### **Gradio (Port 7860)**
+- **Web interface** for interactive use
+- **Real-time analysis** with visual results
+- **JSON export** functionality
+
+## 🚀 **Deployment Steps**
+
+### **1. Deploy to Hugging Face Spaces**
+
+1. **Create a new Space:**
+   - Go to [huggingface.co/spaces](https://huggingface.co/spaces)
+   - Choose **Docker** SDK
+   - Name: `pravaah-ocean-hazard-detection`
+
+2. **Upload files:**
+   - Copy all files from the `pravaah` folder
+   - Upload to your Space repository
+
+3. **Set environment variables:**
+   - `TWITTER_API_KEY` - Your Twitter API key
+   - `SUPABASE_URL` - Your Supabase connection string
+
+4. **Deploy:**
+   - Push to repository
+   - Monitor build logs
+
+### **2. Local Development**
+
+```bash
+# Install dependencies
+pip install -r requirements.txt
+
+# Run FastAPI
+uvicorn api:app --host 0.0.0.0 --port 8000
+
+# Run Gradio (in another terminal)
+python app.py
+```
+
+### **3. Docker Deployment**
+
+```bash
+# Build image
+docker build -t pravaah-ocean-hazard .
+
+# Run container
+docker run -p 8000:8000 -p 7860:7860 \
+  -e TWITTER_API_KEY=your_key \
+  -e SUPABASE_URL=your_url \
+  pravaah-ocean-hazard
+```
+
+## 📊 **API Endpoints**
+
+### **Analysis**
+- **POST** `/analyze` - Analyze tweets for hazards
+- **GET** `/hazardous-tweets` - Get stored hazardous tweets
+- **GET** `/stats` - Get analysis statistics
+
+### **Health & Monitoring**
+- **GET** `/health` - Health check
+- **GET** `/` - Root endpoint
+
+## 🔧 **Configuration**
+
+### **Environment Variables**
+```bash
+# Required
+TWITTER_API_KEY=your_twitter_api_key
+SUPABASE_URL=postgresql://postgres:[password]@db.[project-ref].supabase.co:5432/postgres
+
+# Optional
+SUPABASE_ANON_KEY=your_anon_key
+SUPABASE_SERVICE_ROLE_KEY=your_service_role_key
+```
+
+### **Ports**
+- **8000** - FastAPI REST API
+- **7860** - Gradio Web Interface
+
+## 🧪 **Testing**
+
+### **Test API Endpoints**
+```bash
+# Health check
+curl http://localhost:8000/health
+
+# Analyze tweets
+curl -X POST "http://localhost:8000/analyze" \
+  -H "Content-Type: application/json" \
+  -d '{"limit": 10}'
+
+# Get statistics
+curl http://localhost:8000/stats
+```
+
+### **Test Web Interface**
+- Open `http://localhost:7860` in browser
+- Use the interactive interface to analyze tweets
+
+## 📈 **Monitoring**
+
+### **Health Checks**
+- **FastAPI**: `http://localhost:8000/health`
+- **Gradio**: Check if port 7860 is accessible
+
+### **Logs**
+- Both services log to stdout
+- Check Docker logs: `docker logs <container_id>`
+
+## 🎯 **Features**
+
+### **FastAPI Features**
+- ✅ RESTful API endpoints
+- ✅ Automatic API documentation
+- ✅ Request/response validation
+- ✅ Error handling
+- ✅ CORS support
+- ✅ Database integration
+
+### **Gradio Features**
+- ✅ Interactive web interface
+- ✅ Real-time analysis
+- ✅ Visual results display
+- ✅ JSON export
+- ✅ User-friendly controls
+
+## 🔄 **Updates**
+
+To update your deployment:
+1. Make changes to your code
+2. Commit and push to repository
+3. Hugging Face Spaces will automatically rebuild
+4. Both services will restart with new code
+
+## 🆘 **Troubleshooting**
+
+### **Common Issues**
+1. **Port conflicts** - Ensure ports 8000 and 7860 are available
+2. **Database connection** - Check Supabase credentials
+3. **API key issues** - Verify Twitter API key is valid
+4. **Model loading** - Check internet connection for model downloads
+
+### **Getting Help**
+- Check logs for error messages
+- Verify environment variables
+- Test individual components
+- Check Hugging Face Spaces documentation
+
+## 🎉 **Success!**
+
+Once deployed, you'll have:
+- **FastAPI** at `https://your-space.hf.space:8000`
+- **Gradio** at `https://your-space.hf.space:7860`
+- **API docs** at `https://your-space.hf.space:8000/docs`
+
+Your Ocean Hazard Detection System is now live with both API and web interfaces! 🌊
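The guide above exercises the API with curl; the same `/analyze` call can be made from Python using `requests` (already in requirements.txt). This is a minimal sketch, assuming a local deployment on port 8000 as in the testing examples; the response fields match `TweetAnalysisResponse` in api.py.

```python
# Minimal Python client for the /analyze endpoint (sketch; assumes the API
# runs locally on port 8000 as in the curl examples above).
import requests

BASE_URL = "http://localhost:8000"  # assumption: local deployment


def analyze(limit=10, query=None, timeout=300):
    payload = {"limit": limit}
    if query:
        payload["query"] = query
    resp = requests.post(f"{BASE_URL}/analyze", json=payload, timeout=timeout)
    resp.raise_for_status()
    return resp.json()


if __name__ == "__main__":
    report = analyze(limit=5)
    print(f"{report['hazardous_tweets']} of {report['total_tweets']} tweets flagged "
          f"in {report['processing_time']:.1f}s")
```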
Dockerfile
ADDED
@@ -0,0 +1,48 @@
+# Dockerfile for Hugging Face Spaces with FastAPI + Gradio
+
+# Use an official Python runtime as a parent image
+FROM python:3.9-slim
+
+# Set the working directory inside the container
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    gcc \
+    g++ \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy the requirements file and install dependencies first for better caching
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy all your application code into the container
+COPY . .
+
+# IMPORTANT: Download and cache the models during the build process.
+# This makes the application start much faster when the Space wakes up.
+RUN python -c "from classifier import classify_with_model; classify_with_model('test')"
+RUN python -c "from ner import get_ner_pipeline; get_ner_pipeline()"
+RUN python -c "from sentiment import get_emotion_classifier; get_emotion_classifier()"
+RUN python -c "from translate import get_translator; get_translator()"
+
+# Create startup script
+RUN echo '#!/bin/bash\n\
+echo "🚀 Starting FastAPI server on port 8000..."\n\
+python -m uvicorn api:app --host 0.0.0.0 --port 8000 &\n\
+echo "🌊 Starting Gradio web interface on port 7860..."\n\
+python app.py' > start_services.sh
+
+RUN chmod +x start_services.sh
+
+# Expose ports for both services: 7860 (Gradio web interface), 8000 (FastAPI)
+EXPOSE 7860
+EXPOSE 8000
+
+# Health check
+HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+# Start both services
+CMD ["./start_services.sh"]
README.md
CHANGED
@@ -1,12 +1,145 @@
 ---
-title: Pravaah
-emoji:
-colorFrom:
-colorTo:
+title: Pravaah - Ocean Hazard Detection System
+emoji: 🌊
+colorFrom: blue
+colorTo: green
 sdk: docker
 pinned: false
 license: mit
-short_description:
+short_description: AI-powered system to detect ocean hazards with FastAPI + Gradio interface
 ---
 
-
+# 🌊 Ocean Hazard Detection System
+
+An AI-powered system that analyzes social media posts to detect ocean-related hazards in real-time. This system uses advanced natural language processing to identify hazardous tweets, translate them to English, analyze sentiment, and extract location information.
+
+## 🚀 Features
+
+- **Multilingual Support**: Analyzes tweets in 20+ Indian languages including Hindi, Bengali, Tamil, Telugu, Marathi, Gujarati, and English
+- **Hazard Classification**: Uses XLM-RoBERTa zero-shot classification to identify ocean hazards
+- **Sentiment Analysis**: Analyzes emotional context using GoEmotions model
+- **Named Entity Recognition**: Extracts hazard types and locations from text
+- **Real-time Processing**: Processes tweets from Indian coastal regions
+- **Database Storage**: Stores hazardous tweets for tracking and analysis
+
+## 🔍 What It Detects
+
+### Hazard Types
+- Floods and tsunamis
+- Cyclones and storm surges
+- High tides and waves
+- Coastal flooding and erosion
+- Rip currents and marine debris
+- Water discoloration and algal blooms
+- Marine pollution
+
+### Geographic Coverage
+- **Major Cities**: Mumbai, Chennai, Kolkata, Vizag, Puri
+- **States**: Odisha, Kerala, Gujarat, Goa, Andhra Pradesh, West Bengal
+- **Water Bodies**: Bay of Bengal, Arabian Sea
+
+## 🛠️ Technical Stack
+
+- **AI Models**:
+  - XLM-RoBERTa for hazard classification
+  - Helsinki-NLP for translation
+  - GoEmotions for sentiment analysis
+  - Custom NER for location extraction
+- **Backend**: FastAPI + Gradio
+- **Database**: PostgreSQL
+- **Languages**: Python 3.9+
+
+## 📊 How It Works
+
+1. **Tweet Collection**: Scrapes tweets using Twitter API with hazard and location keywords
+2. **Hazard Classification**: Uses zero-shot learning to classify tweets as hazardous or safe
+3. **Translation**: Translates non-English tweets to English for consistent processing
+4. **Sentiment Analysis**: Analyzes emotional context (panic, calm, confusion, neutral)
+5. **Entity Extraction**: Identifies specific hazard types and locations
+6. **Database Storage**: Stores hazardous tweets with metadata for tracking
+
+## 🚀 Usage
+
+### Web Interface (Gradio)
+1. **Set Tweet Limit**: Choose how many tweets to analyze (1-50)
+2. **Click Analyze**: The system will process tweets and show results
+3. **View Results**: See hazardous tweets with sentiment, location, and hazard type
+4. **Export Data**: Download complete analysis as JSON
+
+### API Endpoints (FastAPI)
+
+#### **POST /analyze**
+Analyze tweets for ocean hazards
+```bash
+curl -X POST "http://localhost:8000/analyze" \
+  -H "Content-Type: application/json" \
+  -d '{"limit": 20, "query": "flood OR tsunami"}'
+```
+
+#### **GET /hazardous-tweets**
+Get stored hazardous tweets
+```bash
+curl "http://localhost:8000/hazardous-tweets?limit=50&offset=0"
+```
+
+#### **GET /stats**
+Get analysis statistics
+```bash
+curl "http://localhost:8000/stats"
+```
+
+#### **GET /health**
+Health check endpoint
+```bash
+curl "http://localhost:8000/health"
+```
+
+### API Documentation
+- **Swagger UI**: `http://localhost:8000/docs`
+- **ReDoc**: `http://localhost:8000/redoc`
+
+## 🔧 Environment Variables
+
+The system requires the following environment variables:
+
+```bash
+# Twitter API (required)
+TWITTER_API_KEY=your_twitter_api_key
+
+# PostgreSQL Database (optional for demo)
+PGHOST=localhost
+PGPORT=5432
+PGDATABASE=postgres
+PGUSER=postgres
+PGPASSWORD=your_password
+```
+
+## 📈 Use Cases
+
+- **Emergency Response**: Early detection of ocean hazards for rapid response
+- **Environmental Monitoring**: Track marine pollution and coastal issues
+- **Research**: Analyze public sentiment about ocean-related events
+- **Policy Making**: Data-driven insights for coastal management policies
+
+## 🔬 Model Details
+
+- **Classification Model**: `joeddav/xlm-roberta-large-xnli`
+- **Translation Model**: Helsinki-NLP OPUS-MT models
+- **Sentiment Model**: Google GoEmotions
+- **NER**: Custom keyword-based extraction with fallback
+
+## 📝 License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+## 🤝 Contributing
+
+Contributions are welcome! Please feel free to submit a Pull Request.
+
+## 📞 Support
+
+For support, please open an issue in the GitHub repository.
+
+---
+
+**Note**: This is a demonstration system. In production, it would process real-time tweets and integrate with emergency response systems.
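The six-step pipeline described under "How It Works" maps directly onto the modules added in this commit. A minimal sketch of running it end to end from a Python shell, assuming `TWITTER_API_KEY` is set and the Hugging Face models can be downloaded (database persistence is optional and handled separately by api.py/app.py):

```python
# End-to-end pipeline sketch using the modules in this commit.
from scraper import fetch_hazard_tweets
from classifier import classify_tweets

tweets = fetch_hazard_tweets(limit=5)   # 1. tweet collection
results = classify_tweets(tweets)       # 2-5. classification, translation, sentiment, NER

for r in results:
    if r.get("hazardous") == 1:
        print(r["tweet_url"], r["ner"]["hazards"], r["ner"]["locations"])
```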
api.py
ADDED
@@ -0,0 +1,252 @@
+from fastapi import FastAPI, BackgroundTasks, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import List, Optional
+import json
+import logging
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+
+# Import our modules
+from scraper import fetch_hazard_tweets
+from classifier import classify_tweets
+from pg_db import init_db, upsert_hazardous_tweet
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Initialize FastAPI app
+app = FastAPI(
+    title="Ocean Hazard Detection API",
+    description="API for detecting ocean hazards from social media posts",
+    version="1.0.0"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Configure this properly for production
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Initialize database
+try:
+    init_db()
+    logger.info("Database initialized successfully")
+except Exception as e:
+    logger.warning(f"Database initialization failed: {e}. API will work without database persistence.")
+
+# Pydantic models
+class TweetAnalysisRequest(BaseModel):
+    limit: int = 20
+    query: Optional[str] = None
+
+class TweetAnalysisResponse(BaseModel):
+    total_tweets: int
+    hazardous_tweets: int
+    results: List[dict]
+    processing_time: float
+
+class HealthResponse(BaseModel):
+    status: str
+    message: str
+    timestamp: str
+
+# Health check endpoint
+@app.get("/", response_model=HealthResponse)
+def health_check():
+    """Health check endpoint"""
+    return HealthResponse(
+        status="healthy",
+        message="Ocean Hazard Detection API is running",
+        timestamp=datetime.utcnow().isoformat()
+    )
+
+@app.get("/health", response_model=HealthResponse)
+def health():
+    """Alternative health check endpoint"""
+    return health_check()
+
+# Main analysis endpoint
+@app.post("/analyze", response_model=TweetAnalysisResponse)
+async def analyze_tweets(request: TweetAnalysisRequest):
+    """
+    Analyze tweets for ocean hazards
+
+    - **limit**: Number of tweets to analyze (1-50)
+    - **query**: Custom search query (optional)
+    """
+    start_time = datetime.utcnow()
+
+    try:
+        logger.info(f"Starting analysis with limit: {request.limit}")
+
+        # Fetch tweets
+        if request.query:
+            # Use custom query if provided
+            from scraper import search_tweets, extract_tweets
+            result = search_tweets(request.query, limit=request.limit)
+            tweets = extract_tweets(result)
+        else:
+            # Use default hazard query
+            tweets = fetch_hazard_tweets(limit=request.limit)
+
+        logger.info(f"Fetched {len(tweets)} tweets")
+
+        # Classify tweets
+        results = classify_tweets(tweets)
+        logger.info(f"Classified {len(results)} tweets")
+
+        # Store hazardous tweets in database
+        hazardous_count = 0
+        try:
+            for r in results:
+                if r.get('hazardous') == 1:
+                    hazardous_count += 1
+                    hazards = (r.get('ner') or {}).get('hazards') or []
+                    hazard_type = ", ".join(hazards) if hazards else "unknown"
+                    locs = (r.get('ner') or {}).get('locations') or []
+                    if not locs and r.get('location'):
+                        locs = [r['location']]
+                    location = ", ".join(locs) if locs else "unknown"
+                    sentiment = r.get('sentiment') or {"label": "unknown", "score": 0.0}
+                    created_at = r.get('created_at') or ""
+                    tweet_date = ""
+                    tweet_time = ""
+                    if created_at:
+                        dt = None
+                        try:
+                            dt = parsedate_to_datetime(created_at)
+                        except Exception:
+                            dt = None
+                        if dt is None and 'T' in created_at:
+                            try:
+                                iso = created_at.replace('Z', '+00:00')
+                                dt = datetime.fromisoformat(iso)
+                            except Exception:
+                                dt = None
+                        if dt is not None:
+                            tweet_date = dt.date().isoformat()
+                            tweet_time = dt.time().strftime('%H:%M:%S')
+                    upsert_hazardous_tweet(
+                        tweet_url=r.get('tweet_url') or "",
+                        hazard_type=hazard_type,
+                        location=location,
+                        sentiment_label=sentiment.get('label', 'unknown'),
+                        sentiment_score=float(sentiment.get('score', 0.0)),
+                        tweet_date=tweet_date,
+                        tweet_time=tweet_time,
+                    )
+            logger.info(f"Stored {hazardous_count} hazardous tweets in database")
+        except Exception as db_error:
+            logger.warning(f"Database storage failed: {db_error}. Results will not be persisted.")
+
+        # Calculate processing time
+        processing_time = (datetime.utcnow() - start_time).total_seconds()
+
+        return TweetAnalysisResponse(
+            total_tweets=len(results),
+            hazardous_tweets=hazardous_count,
+            results=results,
+            processing_time=processing_time
+        )
+
+    except Exception as e:
+        logger.error(f"Analysis failed: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Get stored hazardous tweets
+@app.get("/hazardous-tweets")
+async def get_hazardous_tweets(limit: int = 100, offset: int = 0):
+    """
+    Get stored hazardous tweets from database
+
+    - **limit**: Maximum number of tweets to return (default: 100)
+    - **offset**: Number of tweets to skip (default: 0)
+    """
+    try:
+        from pg_db import get_conn
+
+        with get_conn() as conn:
+            with conn.cursor() as cur:
+                cur.execute("""
+                    SELECT tweet_url, hazard_type, location, sentiment_label,
+                           sentiment_score, tweet_date, tweet_time, inserted_at
+                    FROM hazardous_tweets
+                    ORDER BY inserted_at DESC
+                    LIMIT %s OFFSET %s
+                """, (limit, offset))
+
+                columns = [desc[0] for desc in cur.description]
+                results = [dict(zip(columns, row)) for row in cur.fetchall()]
+
+        return {
+            "tweets": results,
+            "count": len(results),
+            "limit": limit,
+            "offset": offset
+        }
+
+    except Exception as e:
+        logger.error(f"Failed to fetch hazardous tweets: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Get statistics
+@app.get("/stats")
+async def get_stats():
+    """Get analysis statistics"""
+    try:
+        from pg_db import get_conn
+
+        with get_conn() as conn:
+            with conn.cursor() as cur:
+                # Total hazardous tweets
+                cur.execute("SELECT COUNT(*) FROM hazardous_tweets")
+                total_hazardous = cur.fetchone()[0]
+
+                # By hazard type
+                cur.execute("""
+                    SELECT hazard_type, COUNT(*) as count
+                    FROM hazardous_tweets
+                    GROUP BY hazard_type
+                    ORDER BY count DESC
+                """)
+                hazard_types = [{"type": row[0], "count": row[1]} for row in cur.fetchall()]
+
+                # By location
+                cur.execute("""
+                    SELECT location, COUNT(*) as count
+                    FROM hazardous_tweets
+                    WHERE location != 'unknown'
+                    GROUP BY location
+                    ORDER BY count DESC
+                    LIMIT 10
+                """)
+                locations = [{"location": row[0], "count": row[1]} for row in cur.fetchall()]
+
+                # By sentiment
+                cur.execute("""
+                    SELECT sentiment_label, COUNT(*) as count
+                    FROM hazardous_tweets
+                    GROUP BY sentiment_label
+                    ORDER BY count DESC
+                """)
+                sentiments = [{"sentiment": row[0], "count": row[1]} for row in cur.fetchall()]
+
+        return {
+            "total_hazardous_tweets": total_hazardous,
+            "hazard_types": hazard_types,
+            "top_locations": locations,
+            "sentiment_distribution": sentiments
+        }
+
+    except Exception as e:
+        logger.error(f"Failed to fetch statistics: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)
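The endpoints above can be exercised without starting a server by using FastAPI's test client. A sketch, with the caveats that `TestClient` needs the `httpx` package (not listed in requirements.txt) and that importing `api` loads the transformer models and attempts `init_db()`:

```python
# Smoke-test the FastAPI app in-process (sketch; assumes httpx is installed).
from fastapi.testclient import TestClient

from api import app  # importing api.py loads the models and tries init_db()

client = TestClient(app)

resp = client.get("/health")
assert resp.status_code == 200
print(resp.json()["status"])  # expected: "healthy"
```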
app.py
ADDED
@@ -0,0 +1,197 @@
+import gradio as gr
+import json
+import os
+import logging
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+try:
+    from scraper import fetch_hazard_tweets
+    from classifier import classify_tweets
+    from pg_db import init_db, upsert_hazardous_tweet
+    # Initialize database (optional - will work without it)
+    try:
+        init_db()
+        logger.info("Database initialized successfully")
+    except Exception as e:
+        logger.warning(f"Database initialization failed: {e}. App will work without database persistence.")
+except ImportError as e:
+    logger.error(f"Failed to import required modules: {e}")
+    raise
+
+def run_pipeline(limit=20):
+    """Run the hazard detection pipeline"""
+    try:
+        logger.info(f"Starting pipeline with limit: {limit}")
+        tweets = fetch_hazard_tweets(limit=limit)
+        logger.info(f"Fetched {len(tweets)} tweets")
+
+        results = classify_tweets(tweets)
+        logger.info(f"Classified {len(results)} tweets")
+
+        # Store hazardous tweets in database (optional)
+        try:
+            hazardous_count = 0
+            for r in results:
+                if r.get('hazardous') == 1:
+                    hazardous_count += 1
+                    hazards = (r.get('ner') or {}).get('hazards') or []
+                    hazard_type = ", ".join(hazards) if hazards else "unknown"
+                    locs = (r.get('ner') or {}).get('locations') or []
+                    if not locs and r.get('location'):
+                        locs = [r['location']]
+                    location = ", ".join(locs) if locs else "unknown"
+                    sentiment = r.get('sentiment') or {"label": "unknown", "score": 0.0}
+                    created_at = r.get('created_at') or ""
+                    tweet_date = ""
+                    tweet_time = ""
+                    if created_at:
+                        dt = None
+                        try:
+                            dt = parsedate_to_datetime(created_at)
+                        except Exception:
+                            dt = None
+                        if dt is None and 'T' in created_at:
+                            try:
+                                iso = created_at.replace('Z', '+00:00')
+                                dt = datetime.fromisoformat(iso)
+                            except Exception:
+                                dt = None
+                        if dt is not None:
+                            tweet_date = dt.date().isoformat()
+                            tweet_time = dt.time().strftime('%H:%M:%S')
+                    upsert_hazardous_tweet(
+                        tweet_url=r.get('tweet_url') or "",
+                        hazard_type=hazard_type,
+                        location=location,
+                        sentiment_label=sentiment.get('label', 'unknown'),
+                        sentiment_score=float(sentiment.get('score', 0.0)),
+                        tweet_date=tweet_date,
+                        tweet_time=tweet_time,
+                    )
+            logger.info(f"Stored {hazardous_count} hazardous tweets in database")
+        except Exception as db_error:
+            logger.warning(f"Database storage failed: {db_error}. Results will not be persisted.")
+
+        return results
+    except Exception as e:
+        logger.error(f"Pipeline failed: {str(e)}")
+        return f"Error: {str(e)}"
+
+def analyze_tweets(limit):
+    """Gradio interface function to analyze tweets"""
+    try:
+        limit = int(limit) if limit else 20
+        results = run_pipeline(limit=limit)
+
+        if isinstance(results, str):  # Error case
+            return results, ""
+
+        # Count hazardous tweets
+        hazardous_count = sum(1 for r in results if r.get('hazardous') == 1)
+        total_count = len(results)
+
+        # Format results for display
+        display_text = f"Analyzed {total_count} tweets, found {hazardous_count} hazardous tweets.\n\n"
+
+        for i, result in enumerate(results, 1):
+            status = "🚨 HAZARDOUS" if result.get('hazardous') == 1 else "✅ Safe"
+            display_text += f"{i}. {status}\n"
+            display_text += f"   Text: {result.get('text', 'N/A')[:100]}...\n"
+            if result.get('translated_text'):
+                display_text += f"   Translated: {result.get('translated_text', 'N/A')[:100]}...\n"
+            if result.get('hazardous') == 1:
+                sentiment = result.get('sentiment', {})
+                display_text += f"   Sentiment: {sentiment.get('label', 'unknown')} ({sentiment.get('score', 0):.2f})\n"
+                ner = result.get('ner', {})
+                if ner.get('hazards'):
+                    display_text += f"   Hazards: {', '.join(ner.get('hazards', []))}\n"
+                if ner.get('locations'):
+                    display_text += f"   Locations: {', '.join(ner.get('locations', []))}\n"
+            display_text += f"   URL: {result.get('tweet_url', 'N/A')}\n\n"
+
+        # Create JSON output
+        json_output = json.dumps(results, indent=2, ensure_ascii=False)
+
+        return display_text, json_output
+
+    except Exception as e:
+        return f"Error: {str(e)}", ""
+
+# Health check endpoint
+def health_check():
+    """Simple health check for Docker"""
+    return {"status": "healthy", "message": "Ocean Hazard Detection System is running"}
+
+# Create Gradio interface
+with gr.Blocks(title="Ocean Hazard Detection", theme=gr.themes.Soft()) as demo:
+    gr.Markdown("""
+    # 🌊 Ocean Hazard Detection System
+
+    This system analyzes tweets to detect ocean-related hazards using AI. It:
+    - Scrapes tweets about ocean hazards from Indian coastal regions
+    - Classifies tweets as hazardous or safe using multilingual AI
+    - Translates non-English tweets to English
+    - Analyzes sentiment and extracts hazard types and locations
+    - Stores hazardous tweets in a database for tracking
+
+    **Note**: This demo uses a limited dataset. In production, it would analyze real-time tweets.
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            limit_input = gr.Number(
+                label="Number of tweets to analyze",
+                value=10,
+                minimum=1,
+                maximum=50,
+                step=1
+            )
+            analyze_btn = gr.Button("🔍 Analyze Tweets", variant="primary")
+
+        with gr.Column():
+            gr.Markdown("### 📊 Analysis Results")
+            results_text = gr.Textbox(
+                label="Analysis Summary",
+                lines=15,
+                max_lines=20,
+                interactive=False
+            )
+
+    with gr.Row():
+        gr.Markdown("### 📄 Raw JSON Output")
+        json_output = gr.Textbox(
+            label="Complete Analysis Data (JSON)",
+            lines=10,
+            max_lines=15,
+            interactive=False
+        )
+
+    # Event handlers
+    analyze_btn.click(
+        fn=analyze_tweets,
+        inputs=[limit_input],
+        outputs=[results_text, json_output]
+    )
+
+    # Add some example queries
+    gr.Markdown("""
+    ### 🔍 What this system looks for:
+    - **Hazard Keywords**: flood, tsunami, cyclone, storm surge, high tide, high waves, swell, coastal flooding, rip current, coastal erosion, water discoloration, algal bloom, marine debris, pollution
+    - **Locations**: Mumbai, Chennai, Kolkata, Odisha, Kerala, Gujarat, Goa, Andhra Pradesh, West Bengal, Vizag, Puri, Bay of Bengal, Arabian Sea
+    - **Languages**: Supports 20+ Indian languages including Hindi, Bengali, Tamil, Telugu, Marathi, Gujarati, and English
+    """)
+
+if __name__ == "__main__":
+    # Add health check route
+    demo.launch(
+        server_name="0.0.0.0",  # Important for Docker
+        server_port=7860,  # Gradio default port
+        show_error=True,  # Show errors in the interface
+        share=False,  # Don't create public link
+        debug=True  # Enable debug mode
+    )
+
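Because the Gradio callback is an ordinary Python function, it can be smoke-tested without launching the web UI. A sketch, assuming the same environment app.py itself needs (`TWITTER_API_KEY` set, models downloadable):

```python
# Call the Gradio callback directly instead of clicking through the UI.
from app import analyze_tweets

summary, raw_json = analyze_tweets(3)  # returns (display text, JSON string)
print(summary[:500])
```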
classifier.py
ADDED
@@ -0,0 +1,52 @@
+from transformers import pipeline
+from scraper import fetch_hazard_tweets
+from translate import translate_to_english
+from sentiment import classify_emotion_text
+from ner import extract_hazard_and_locations
+import json
+
+model_name = "joeddav/xlm-roberta-large-xnli"
+
+classifier = pipeline("zero-shot-classification", model=model_name, framework="pt")
+
+def classify_with_model(tweet_text):
+    """
+    Classifies a tweet using a MULTILINGUAL zero-shot learning model.
+    Returns 1 if hazardous, else 0.
+    """
+    if not tweet_text or not tweet_text.strip():
+        return 0
+    candidate_labels = ["report of an ocean hazard", "not an ocean hazard"]
+    result = classifier(tweet_text, candidate_labels)
+    top_label = result['labels'][0]
+    top_score = result['scores'][0]
+    if top_label == "report of an ocean hazard" and top_score > 0.75:
+        return 1
+    return 0
+
+def classify_tweets(tweets):
+    """
+    Accepts list of tweet dicts with 'text' field.
+    Pipeline: classify hazard -> if hazardous, translate -> sentiment -> NER.
+    Returns enriched dicts.
+    """
+    classified = []
+    for t in tweets:
+        text = t.get('text', '')
+        hazardous = classify_with_model(text)
+        item = dict(t)
+        item['hazardous'] = hazardous
+        translated = translate_to_english(text)
+        item['translated_text'] = translated
+        if hazardous == 1:
+            sentiment = classify_emotion_text(translated)
+            item['sentiment'] = sentiment
+            ner_info = extract_hazard_and_locations(translated)
+            item['ner'] = ner_info
+        classified.append(item)
+    return classified
+
+if __name__ == "__main__":
+    tweets = fetch_hazard_tweets(limit=20)
+    classified = classify_tweets(tweets)
+    print(json.dumps(classified, indent=2, ensure_ascii=False))
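The 0.75 threshold in `classify_with_model` operates on the standard zero-shot output, where candidate labels are returned ranked by score. A small sketch of that decision rule on a result dict of the same shape (the sequence and scores below are made up purely to illustrate the structure):

```python
# Illustration of the decision rule in classify_with_model.
# The scores are hypothetical; only the structure matches the pipeline output.
result = {
    "sequence": "Huge waves flooding the promenade in Mumbai",
    "labels": ["report of an ocean hazard", "not an ocean hazard"],
    "scores": [0.91, 0.09],
}

top_label, top_score = result["labels"][0], result["scores"][0]
hazardous = 1 if (top_label == "report of an ocean hazard" and top_score > 0.75) else 0
print(hazardous)  # -> 1
```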
ner.py
ADDED
@@ -0,0 +1,125 @@
+from transformers import pipeline
+
+_ner_pipeline = None
+
+def get_ner_pipeline():
+    """
+    Lazily load and return NER pipeline for multilingual location extraction.
+    """
+    global _ner_pipeline
+    if _ner_pipeline is not None:
+        return _ner_pipeline
+    # Use a lighter multilingual model to avoid OOM on constrained machines
+    ner_model_name = "Davlan/bert-base-multilingual-cased-ner-hrl"
+    try:
+        _ner_pipeline = pipeline("ner", model=ner_model_name, aggregation_strategy="simple")
+    except Exception:
+        # Return None to allow regex/location keyword fallback downstream
+        _ner_pipeline = None
+    return _ner_pipeline
+
+def extract_hazard_and_locations(text):
+    """
+    Extract hazard keywords and locations from a single text.
+    Returns dict: {hazards: [..], locations: [..]}
+    """
+    if not text or not text.strip():
+        return {"hazards": [], "locations": []}
+
+    hazard_keywords = [
+        'Tsunami', 'High Waves', 'Coastal Flooding', 'Storm Surge',
+        'Rip Current', 'Coastal Erosion', 'Algal Bloom',
+        'Marine Pollution', 'Cyclone', 'flood'
+    ]
+    detected_hazards = []
+    text_lower = text.lower()
+    for hazard in hazard_keywords:
+        if hazard.lower() in text_lower:
+            detected_hazards.append(hazard)
+
+    ner = get_ner_pipeline()
+    locations = []
+    if ner is not None:
+        try:
+            ner_results = ner(text)
+            locations = [entity['word'] for entity in ner_results if entity.get('entity_group') == 'LOC']
+        except Exception:
+            locations = []
+    # Fallback: simple keyword-based location spotting if NER unavailable
+    if not locations:
+        location_keywords = [
+            "Mumbai", "Chennai", "Kolkata", "Odisha", "Kerala", "Gujarat", "Goa",
+            "Andhra Pradesh", "West Bengal", "Vizag", "Visakhapatnam", "Puri",
+            "Bay of Bengal", "Arabian Sea", "Tamil Nadu", "Maharashtra", "Karnataka",
+            "Andaman", "Nicobar", "Lakshadweep", "Kochi", "Cochin", "Mangaluru", "Mangalore",
+            "Chandipur", "Paradip", "Digha", "Gopalpur"
+        ]
+        text_lower = text.lower()
+        for name in location_keywords:
+            if name.lower() in text_lower:
+                locations.append(name)
+
+    return {"hazards": detected_hazards, "locations": locations}
+
+# Hard-coded demo runner, kept as a standalone function so importing this module stays side-effect free.
+def extract_hazard_info():
+    """
+    Loads a Named Entity Recognition (NER) model to find locations and then
+    searches the text for specific hazard-related keywords.
+    """
+    # --- 1. Load the NER Model for Location Extraction ---
+    # UPDATED: Using the large, high-accuracy model as requested.
+    ner_model_name = "Davlan/xlm-roberta-large-ner-hrl"
+    print(f"Loading NER model: '{ner_model_name}'...")
+    try:
+        ner_pipeline = pipeline("ner", model=ner_model_name, aggregation_strategy="simple")
+        print("NER model loaded successfully!")
+    except Exception as e:
+        print(f"Failed to load NER model. Error: {e}")
+        return
+
+    # --- 2. Define the Hazard Keywords to search for ---
+    # These are the exact phrases we will look for in the text.
+    hazard_keywords = [
+        'Tsunami', 'High Waves', 'Coastal Flooding', 'Storm Surge',
+        'Rip Current', 'Coastal Erosion', 'Algal Bloom',
+        'Marine Pollution', 'Cyclone', 'flood'  # Added "flood" as a common variation
+    ]
+
+    # --- 3. Prepare Example Tweets for Analysis ---
+    tweets_to_analyze = [
+        "Major coastal flooding reported in Chennai due to the storm surge. All residents advised to stay indoors.",
+        "Authorities have issued a tsunami warning for the entire Odisha coastline after the earthquake.",
+        "The recent cyclone has caused severe coastal erosion near Puri beach.",
+        "मुंबई में ऊंची लहरों की चेतावनी है, कृपया समुद्र तट से दूर रहें।",  # Hindi: "Warning of high waves in Mumbai, please stay away from the beach."
+        "Not a hazard: The sunset over the calm sea in Goa was beautiful today."
+    ]
+
+    print("\n--- Analyzing Tweets for Hazards and Locations ---")
+
+    for tweet in tweets_to_analyze:
+        try:
+            # --- Step 1: Extract Locations using the NER model ---
+            ner_results = ner_pipeline(tweet)
+            # Filter the results to get only the words identified as locations ('LOC').
+            locations = [entity['word'] for entity in ner_results if entity['entity_group'] == 'LOC']
+
+            # --- Step 2: Extract Hazard Keywords directly from the text ---
+            detected_hazards = []
+            tweet_lower = tweet.lower()
+            for hazard in hazard_keywords:
+                # Check if the hazard keyword exists in the tweet (case-insensitive)
+                if hazard.lower() in tweet_lower:
+                    detected_hazards.append(hazard)
+
+            # --- Print the structured results ---
+            print(f"Text: '{tweet}'")
+            print(f" -> Location(s): {locations if locations else 'None Detected'}")
+            print(f" -> Detected Hazard(s): {detected_hazards if detected_hazards else 'None Detected'}")
+            print("-" * 25)
+
+        except Exception as e:
+            print(f"Could not process tweet: '{tweet}'. Error: {e}")
+
+if __name__ == "__main__":
+    extract_hazard_info()
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from contextlib import contextmanager
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
import psycopg2
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def _load_env_file(path: str = ".env"):
|
| 9 |
+
if not os.path.isfile(path):
|
| 10 |
+
return
|
| 11 |
+
try:
|
| 12 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 13 |
+
for line in f:
|
| 14 |
+
line = line.strip()
|
| 15 |
+
if not line or line.startswith("#"):
|
| 16 |
+
continue
|
| 17 |
+
if "=" in line:
|
| 18 |
+
key, value = line.split("=", 1)
|
| 19 |
+
key = key.strip()
|
| 20 |
+
value = value.strip().strip('"').strip("'")
|
| 21 |
+
if key and key not in os.environ:
|
| 22 |
+
os.environ[key] = value
|
| 23 |
+
except Exception:
|
| 24 |
+
pass
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _conn_params():
|
| 28 |
+
# Load .env into environment if present
|
| 29 |
+
_load_env_file()
|
| 30 |
+
|
| 31 |
+
# Check if we have Supabase URL (preferred method)
|
| 32 |
+
supabase_url = os.getenv('SUPABASE_URL')
|
| 33 |
+
if supabase_url:
|
| 34 |
+
# Extract connection details from Supabase URL
|
| 35 |
+
# Format: postgresql://postgres:[password]@[host]:[port]/postgres
|
| 36 |
+
import urllib.parse
|
| 37 |
+
parsed = urllib.parse.urlparse(supabase_url)
|
| 38 |
+
|
| 39 |
+
return dict(
|
| 40 |
+
host=parsed.hostname,
|
| 41 |
+
port=parsed.port or 5432,
|
| 42 |
+
dbname=parsed.path[1:] if parsed.path else 'postgres',
|
| 43 |
+
user=parsed.username or 'postgres',
|
| 44 |
+
password=parsed.password,
|
| 45 |
+
sslmode='require' # Supabase requires SSL
|
| 46 |
+
)
|
| 47 |
+
else:
|
| 48 |
+
# Fallback to individual environment variables
|
| 49 |
+
return dict(
|
| 50 |
+
host=os.getenv("PGHOST", "localhost"),
|
| 51 |
+
port=int(os.getenv("PGPORT", "5432")),
|
| 52 |
+
dbname=os.getenv("PGDATABASE", "postgres"),
|
| 53 |
+
user=os.getenv("PGUSER", "postgres"),
|
| 54 |
+
password=os.getenv("PGPASSWORD", ""),
|
| 55 |
+
sslmode='require' if os.getenv('PGHOST') and 'supabase' in os.getenv('PGHOST', '') else 'prefer'
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@contextmanager
|
| 60 |
+
def get_conn():
|
| 61 |
+
conn = psycopg2.connect(**_conn_params())
|
| 62 |
+
try:
|
| 63 |
+
yield conn
|
| 64 |
+
conn.commit()
|
| 65 |
+
finally:
|
| 66 |
+
conn.close()
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def init_db():
|
| 70 |
+
create_sql = """
|
| 71 |
+
CREATE TABLE IF NOT EXISTS hazardous_tweets (
|
| 72 |
+
id SERIAL PRIMARY KEY,
|
| 73 |
+
tweet_url TEXT UNIQUE,
|
| 74 |
+
hazard_type TEXT,
|
| 75 |
+
location TEXT,
|
| 76 |
+
sentiment_label TEXT,
|
| 77 |
+
sentiment_score DOUBLE PRECISION,
|
| 78 |
+
tweet_date DATE,
|
| 79 |
+
tweet_time TIME,
|
| 80 |
+
inserted_at TIMESTAMPTZ DEFAULT NOW()
|
| 81 |
+
);
|
| 82 |
+
"""
|
| 83 |
+
try:
|
| 84 |
+
with get_conn() as conn:
|
| 85 |
+
with conn.cursor() as cur:
|
| 86 |
+
cur.execute(create_sql)
|
| 87 |
+
print("✅ Database table initialized successfully")
|
| 88 |
+
except Exception as e:
|
| 89 |
+
print(f"❌ Error initializing database: {e}")
|
| 90 |
+
print("💡 Try running: python fix_database.py")
|
| 91 |
+
raise
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def upsert_hazardous_tweet(
|
| 95 |
+
*,
|
| 96 |
+
tweet_url: str,
|
| 97 |
+
hazard_type: str,
|
| 98 |
+
location: str,
|
| 99 |
+
sentiment_label: str,
|
| 100 |
+
sentiment_score: float,
|
| 101 |
+
tweet_date: str,
|
| 102 |
+
tweet_time: str,
|
| 103 |
+
):
|
| 104 |
+
"""
|
| 105 |
+
Insert if new; ignore duplicates based on tweet_url.
|
| 106 |
+
"""
|
| 107 |
+
insert_sql = """
|
| 108 |
+
INSERT INTO hazardous_tweets (
|
| 109 |
+
tweet_url, hazard_type, location, sentiment_label, sentiment_score,
|
| 110 |
+
tweet_date, tweet_time, inserted_at
|
| 111 |
+
) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
|
| 112 |
+
ON CONFLICT (tweet_url) DO NOTHING;
|
| 113 |
+
"""
|
| 114 |
+
# Convert date/time strings to PostgreSQL-friendly formats
|
| 115 |
+
date_val = tweet_date if tweet_date else None
|
| 116 |
+
time_val = tweet_time if tweet_time else None
|
| 117 |
+
with get_conn() as conn:
|
| 118 |
+
with conn.cursor() as cur:
|
| 119 |
+
cur.execute(
|
| 120 |
+
insert_sql,
|
| 121 |
+
(
|
| 122 |
+
tweet_url,
|
| 123 |
+
hazard_type,
|
| 124 |
+
location,
|
| 125 |
+
sentiment_label,
|
| 126 |
+
float(sentiment_score),
|
| 127 |
+
date_val,
|
| 128 |
+
time_val,
|
| 129 |
+
datetime.utcnow().isoformat(timespec="seconds") + "Z",
|
| 130 |
+
),
|
| 131 |
+
)
|
| 132 |
+
|
| 133 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core ML/NLP dependencies
|
| 2 |
+
transformers>=4.30.0
|
| 3 |
+
torch>=2.0.0
|
| 4 |
+
tokenizers>=0.13.0
|
| 5 |
+
sentencepiece>=0.1.99
|
| 6 |
+
protobuf>=3.20.0
|
| 7 |
+
|
| 8 |
+
# Database connectivity
|
| 9 |
+
psycopg2-binary>=2.9.0
|
| 10 |
+
|
| 11 |
+
# HTTP requests and API calls
|
| 12 |
+
requests>=2.28.0
|
| 13 |
+
fastapi>=0.100.0
|
| 14 |
+
uvicorn>=0.20.0
|
| 15 |
+
|
| 16 |
+
# Environment variable management
|
| 17 |
+
python-dotenv>=1.0.0
|
| 18 |
+
|
| 19 |
+
# Gradio for Hugging Face Spaces
|
| 20 |
+
gradio>=3.40.0
|
| 21 |
+
|
| 22 |
+
# Additional dependencies for specific models
|
| 23 |
+
# Required for XLM-RoBERTa and multilingual models
|
| 24 |
+
sacrebleu>=2.3.0
|
| 25 |
+
sacremoses>=0.0.53
|
| 26 |
+
|
| 27 |
+
# Additional utilities
|
| 28 |
+
numpy>=1.24.0
|
| 29 |
+
pandas>=2.0.0
|
scraper.py
ADDED
@@ -0,0 +1,92 @@
import requests
import json
from datetime import date, timedelta

from dotenv import load_dotenv
import os

# Load values from .env into environment
load_dotenv()

# Access the API key
API_KEY = os.getenv("TWITTER_API_KEY")


def search_tweets(query, query_type="Latest", limit=20):
    """
    Searches for tweets using the twitterapi.io advanced search endpoint.
    """
    url = "https://api.twitterapi.io/twitter/tweet/advanced_search"
    headers = {"X-API-Key": API_KEY}
    params = {"query": query, "queryType": query_type, "limit": limit}

    print(f"🔍 Executing search with query: {query}")
    response = requests.get(url, headers=headers, params=params)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

def extract_tweets(result_json):
    """
    Extracts a normalized list of tweets from the API result.
    Returns a list of dicts with keys: tweet_url, location, created_at, text, hashtags
    """
    if not result_json or 'tweets' not in result_json:
        return []
    tweets = result_json.get('tweets', [])
    extracted_data = []
    for tweet in tweets:
        tweet_url = tweet.get('url')
        text = tweet.get('text')
        created_at = tweet.get('createdAt')
        location = tweet.get('author', {}).get('location', None)
        hashtags = [tag['text'] for tag in tweet.get('entities', {}).get('hashtags', [])]
        extracted_data.append({
            'tweet_url': tweet_url,
            'location': location,
            'created_at': created_at,
            'text': text,
            'hashtags': hashtags
        })
    return extracted_data

def build_default_query():
    """
    Builds the default hazard + India coastal locations + language + date query.
    """
    hazard_keywords = (
        "(flood OR tsunami OR cyclone OR \"storm surge\" OR \"high tide\" OR \"high waves\" OR swell OR "
        "\"coastal flooding\" OR \"rip current\" OR \"coastal erosion\" OR \"water discoloration\" OR "
        "\"algal bloom\" OR \"marine debris\" OR pollution)"
    )
    location_keywords = (
        "(Mumbai OR Chennai OR Kolkata OR Odisha OR Kerala OR Gujarat OR Goa OR \"Andhra Pradesh\" "
        "OR \"West Bengal\" OR Vizag OR Puri OR \"Bay of Bengal\" OR \"Arabian Sea\")"
    )
    allowed_languages = [
        "as", "bn", "brx", "doi", "gu", "hi", "kn", "ks", "kok", "ml", "mni",
        "mr", "ne", "or", "pa", "sa", "sat", "sd", "ta", "te", "ur", "en", "bh",
    ]
    lang_query = "(" + " OR ".join([f"lang:{lang}" for lang in allowed_languages]) + ")"
    yesterday = date.today() - timedelta(days=1)
    date_filter = f"since:{yesterday.strftime('%Y-%m-%d')}"
    full_query = f"{hazard_keywords} {location_keywords} {lang_query} {date_filter}"
    return full_query

def fetch_hazard_tweets(limit=20):
    """
    Fetches tweets matching the default hazard query and returns extracted list.
    """
    query = build_default_query()
    result = search_tweets(query=query, query_type="Latest", limit=limit)
    return extract_tweets(result)

if __name__ == "__main__":
    tweets = fetch_hazard_tweets(limit=20)
    if tweets:
        print("\nExtracted tweets:")
        print(json.dumps(tweets, indent=2, ensure_ascii=False))
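fetch_hazard_tweets() ties the module together: it builds the default hazard/location/language query, calls the twitterapi.io search endpoint, and normalizes the response. A minimal sketch of calling it from another module, assuming TWITTER_API_KEY is present in the environment; the printed fields are the keys returned by extract_tweets():

from scraper import fetch_hazard_tweets

tweets = fetch_hazard_tweets(limit=10)
for t in tweets:
    # each dict has: tweet_url, location, created_at, text, hashtags
    print(t["created_at"], t["location"], t["tweet_url"])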
sentiment.py
ADDED
@@ -0,0 +1,59 @@
from transformers import pipeline

_emotion_classifier = None

def get_emotion_classifier():
    """
    Load (lazily) and return a text-classification pipeline for emotions.
    Uses the GoEmotions RoBERTa-base model (English) for fine-grained emotion labels.
    """
    global _emotion_classifier
    if _emotion_classifier is not None:
        return _emotion_classifier
    model_name = "SamLowe/roberta-base-go_emotions"
    _emotion_classifier = pipeline("text-classification", model=model_name, framework="pt")
    return _emotion_classifier

def classify_emotion_text(text):
    """
    Classify a single text into one of: panic | calm | confusion | neutral | unknown
    Returns dict: {label, score}
    """
    if not text or not text.strip():
        return {"label": "unknown", "score": 0.0}

    emotion_to_category = {
        'fear': 'panic', 'nervousness': 'panic', 'remorse': 'panic',
        'joy': 'calm', 'love': 'calm', 'admiration': 'calm', 'approval': 'calm',
        'caring': 'calm', 'excitement': 'calm', 'gratitude': 'calm', 'optimism': 'calm',
        'relief': 'calm', 'pride': 'calm',
        'confusion': 'confusion', 'curiosity': 'confusion', 'realization': 'confusion',
        'neutral': 'neutral',
        'anger': 'unknown', 'annoyance': 'unknown', 'disappointment': 'unknown',
        'disapproval': 'unknown', 'disgust': 'unknown', 'embarrassment': 'unknown',
        'grief': 'unknown', 'sadness': 'unknown', 'surprise': 'unknown', 'desire': 'unknown'
    }

    classifier = get_emotion_classifier()
    try:
        result = classifier(text)
        top_label = result[0]['label']
        top_score = float(result[0]['score'])
    except Exception:
        return {"label": "unknown", "score": 0.0}

    mapped = emotion_to_category.get(top_label, 'unknown')
    return {"label": mapped, "score": top_score}

if __name__ == "__main__":
    # Simple demo
    examples = [
        "Cyclone warning issued; please evacuate immediately.",
        "Beautiful calm sea today.",
        "Why is the alert not clear?",
        "Meeting at 3 PM.",
    ]
    clf = get_emotion_classifier()
    for ex in examples:
        print(ex, "->", classify_emotion_text(ex))
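classify_emotion_text() collapses the fine-grained GoEmotions labels into five project categories (panic, calm, confusion, neutral, unknown); any label without a mapping falls back to unknown. A minimal sketch of consuming it downstream; the 0.5 confidence cut-off is an illustrative assumption, not part of this module:

from sentiment import classify_emotion_text

result = classify_emotion_text("Huge waves are flooding the coastal road, people are trapped!")
if result["label"] == "panic" and result["score"] > 0.5:  # hypothetical threshold
    print("High-priority report:", result)
else:
    print("Logged as:", result)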
translate.py
ADDED
@@ -0,0 +1,69 @@
from transformers import pipeline

_translator = None

def get_translator():
    """
    Lazily load and return a multilingual→English translation pipeline.
    """
    global _translator
    if _translator is not None:
        return _translator
    model_name = "Helsinki-NLP/opus-mt-mul-en"
    _translator = pipeline("translation", model=model_name)
    return _translator

def translate_to_english(text):
    """
    Translate a single text to English. Returns the translated string.
    If text is empty, returns the original text.
    """
    if not text or not text.strip():
        return text
    translator = get_translator()
    try:
        return translator(text)[0]['translation_text']
    except Exception:
        return text

def translate_indian_languages():
    """
    Loads the multilingual translation model and translates a set of
    example sentences into English.
    """
    # This model is from the Helsinki-NLP group, a standard choice for translation tasks.
    # It handles multiple source languages automatically without needing special tags.
    print("Loading translation model for demo...")
    try:
        translator = get_translator()
        print("Model loaded successfully!")
    except Exception as e:
        print(f"Failed to load the model. Please check your internet connection and library installation. Error: {e}")
        return

    # --- Prepare a list of example sentences to translate ---
    sentences_to_translate = [
        "चेतावनी! चक्रवात तट के करीब आ रहा है, तुरंत खाली करें!",  # Hindi: "Warning! The cyclone is approaching the coast, evacuate immediately!"
        "আজ আবহাওয়া খুব মনোরম।",  # Bengali: "The weather is very pleasant today."
        "మీరు ఎలా ఉన్నారు?",  # Telugu: "How are you?"
        "The meeting is scheduled for 3 PM tomorrow.",  # English (will be handled gracefully)
    ]

    print("\n--- Translating Sentences ---")

    for sentence in sentences_to_translate:
        try:
            # No language detection needed:
            # this model automatically handles different source languages.
            translated_text = translator(sentence)[0]['translation_text']

            print(f"Original: '{sentence}'")
            print(f"Translated: '{translated_text}'")
            print("-" * 25)

        except Exception as e:
            print(f"Could not process sentence: '{sentence}'. Error: {e}")

if __name__ == "__main__":
    translate_indian_languages()
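translate_to_english() is the single-text helper: it returns the input unchanged when the text is empty or translation fails, so callers need no extra error handling. A minimal sketch of chaining it with the emotion classifier from sentiment.py; whether the app wires the two modules exactly this way is not shown here, and the sample sentence is illustrative:

from translate import translate_to_english
from sentiment import classify_emotion_text

raw_text = "चेतावनी! चक्रवात तट के करीब आ रहा है, तुरंत खाली करें!"  # Hindi cyclone warning
english = translate_to_english(raw_text)
print(english, "->", classify_emotion_text(english))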