Upload 8 files
- .env.example +17 -0
- .gitignore +18 -0
- Dockerfile +24 -0
- api.py +72 -0
- app.py +431 -0
- config.py +85 -0
- requirements.txt +21 -0
- utils.py +1402 -0
.env.example
ADDED
@@ -0,0 +1,17 @@
+# API Settings
+API_HOST=0.0.0.0
+API_PORT=8005
+API_BASE_URL=http://0.0.0.0:8005
+
+# News Scraping Settings
+ARTICLES_PER_SOURCE=10
+USER_AGENT=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
+
+# Cache Settings
+CACHE_DIR=.cache
+CACHE_EXPIRY=3600
+CACHE_DURATION=300
+
+# Audio Settings
+AUDIO_OUTPUT_DIR=audio_output
+DEFAULT_LANG=hi
.gitignore
ADDED
@@ -0,0 +1,18 @@
+
+# Ignore virtual environment
+venv/
+.env
+audio_output/
+
+# Ignore compiled Python files
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+sentiment_history/
+# Ignore macOS system files
+.DS_Store
+
+# Ignore log files
+*.log
+audio_output
Dockerfile
ADDED
@@ -0,0 +1,24 @@
+FROM python:3.9-slim
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the application
+COPY . .
+
+# Create necessary directories
+RUN mkdir -p audio_output sentiment_history
+
+# Expose the port Streamlit will run on
+EXPOSE 8501
+
+# Command to run the application
+CMD ["streamlit", "run", "app.py", "--server.address", "0.0.0.0"]
api.py
ADDED
@@ -0,0 +1,72 @@
+"""FastAPI backend for the News Summarization application."""
+
+from fastapi import FastAPI, HTTPException
+from fastapi.staticfiles import StaticFiles
+from pydantic import BaseModel
+from typing import List, Dict, Any
+import uvicorn
+from utils import NewsExtractor, SentimentAnalyzer, TextToSpeechConverter, ComparativeAnalyzer
+import os
+from config import API_PORT, AUDIO_OUTPUT_DIR
+import time
+
+app = FastAPI(title="News Summarization API")
+
+# Mount static directory for audio files
+os.makedirs(AUDIO_OUTPUT_DIR, exist_ok=True)
+app.mount("/audio", StaticFiles(directory=AUDIO_OUTPUT_DIR), name="audio")
+
+# Initialize components
+news_extractor = NewsExtractor()
+sentiment_analyzer = SentimentAnalyzer()
+tts_converter = TextToSpeechConverter()
+comparative_analyzer = ComparativeAnalyzer()
+
+class CompanyRequest(BaseModel):
+    name: str
+
+class AnalysisResponse(BaseModel):
+    company: str
+    articles: List[Dict[str, Any]]
+    comparative_sentiment_score: Dict[str, Any]
+    final_sentiment_analysis: str
+    audio_url: str = None
+
+@app.post("/api/analyze", response_model=AnalysisResponse)
+async def analyze_company(request: CompanyRequest):
+    """Analyze news articles for a given company."""
+    try:
+        # Extract news articles
+        articles = news_extractor.search_news(request.name)
+        if not articles:
+            raise HTTPException(status_code=404, detail="No articles found for the company")
+
+        # Analyze each article
+        analyzed_articles = []
+        for article in articles:
+            analysis = sentiment_analyzer.analyze_article(article)
+            # Add company name to each article
+            analysis['company'] = request.name
+            analyzed_articles.append(analysis)
+
+        # Perform comparative analysis
+        comparison = comparative_analyzer.analyze_coverage(analyzed_articles, company_name=request.name)
+        final_analysis = comparison["final_sentiment"]
+
+        # Generate Hindi audio for final analysis
+        audio_filename = f"{request.name.lower().replace(' ', '_')}_{int(time.time())}"
+        audio_path = tts_converter.generate_audio(final_analysis, audio_filename)
+        audio_url = f"/audio/{os.path.basename(audio_path)}"
+
+        return {
+            "company": request.name,
+            "articles": analyzed_articles,
+            "comparative_sentiment_score": comparison,
+            "final_sentiment_analysis": final_analysis,
+            "audio_url": audio_url
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=API_PORT)
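For reference, a minimal client-side sketch of how this endpoint is consumed (this mirrors what app.py does below; the base URL assumes the defaults from .env.example and config.py, so adjust it for your deployment):

import requests

# Hypothetical smoke test for the /api/analyze endpoint defined above.
resp = requests.post(
    "http://0.0.0.0:8005/api/analyze",  # API_BASE_URL default; change if configured differently
    json={"name": "Tesla"},             # matches the CompanyRequest model
    timeout=600,                        # scraping plus model inference can be slow on first run
)
resp.raise_for_status()
payload = resp.json()
print(payload["final_sentiment_analysis"])
print(payload["audio_url"])  # relative path served by the /audio static mount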
app.py
ADDED
@@ -0,0 +1,431 @@
+"""Streamlit frontend for the News Summarization application."""
+
+import streamlit as st
+import requests
+import pandas as pd
+import json
+from config import API_BASE_URL
+import os
+import plotly.express as px
+import altair as alt
+
+st.set_page_config(
+    page_title="News Summarization App",
+    page_icon="📰",
+    layout="wide"
+)
+
+def analyze_company(company_name):
+    """Send analysis request to API."""
+    try:
+        response = requests.post(
+            f"{API_BASE_URL}/api/analyze",
+            json={"name": company_name}
+        )
+        if response.status_code == 200:
+            data = response.json()
+            # Print the response data for debugging
+            print("API Response Data:")
+            print(json.dumps(data, indent=2))
+
+            # Download audio file if available
+            if 'audio_url' in data:
+                audio_response = requests.get(f"{API_BASE_URL}{data['audio_url']}")
+                if audio_response.status_code == 200:
+                    data['audio_content'] = audio_response.content
+            return data
+        else:
+            st.error(f"Error from API: {response.text}")
+            return {"articles": [], "comparative_sentiment_score": {}, "final_sentiment_analysis": "", "audio_url": None}
+    except Exception as e:
+        st.error(f"Error analyzing company: {str(e)}")
+        return {"articles": [], "comparative_sentiment_score": {}, "final_sentiment_analysis": "", "audio_url": None}
+
+def main():
+    st.title("📰 News Summarization and Analysis")
+
+    # Sidebar
+    st.sidebar.header("Settings")
+
+    # Replace dropdown with text input
+    company = st.sidebar.text_input(
+        "Enter Company Name",
+        placeholder="e.g., Tesla, Apple, Microsoft, or any other company",
+        help="Enter the name of any company you want to analyze"
+    )
+
+    if st.sidebar.button("Analyze") and company:
+        if len(company.strip()) < 2:
+            st.sidebar.error("Please enter a valid company name (at least 2 characters)")
+        else:
+            with st.spinner("Analyzing news articles..."):
+                result = analyze_company(company)
+
+            if result and result.get("articles"):
+                # Display Articles
+                st.header("📑 News Articles")
+                for idx, article in enumerate(result["articles"], 1):
+                    with st.expander(f"Article {idx}: {article['title']}"):
+                        st.write("**Content:**", article.get("content", "No content available"))
+                        if "summary" in article:
+                            st.write("**Summary:**", article["summary"])
+                        st.write("**Source:**", article.get("source", "Unknown"))
+
+                        # Enhanced sentiment display
+                        if "sentiment" in article:
+                            sentiment_col1, sentiment_col2 = st.columns(2)
+                            with sentiment_col1:
+                                st.write("**Sentiment:**", article["sentiment"])
+                                st.write("**Confidence Score:**", f"{article.get('sentiment_score', 0)*100:.1f}%")
+
+                            with sentiment_col2:
+                                # Display fine-grained sentiment if available
+                                if "fine_grained_sentiment" in article and article["fine_grained_sentiment"]:
+                                    fine_grained = article["fine_grained_sentiment"]
+                                    if "category" in fine_grained:
+                                        st.write("**Detailed Sentiment:**", fine_grained["category"])
+                                    if "confidence" in fine_grained:
+                                        st.write("**Confidence:**", f"{fine_grained['confidence']*100:.1f}%")
+
+                        # Display sentiment indices if available
+                        if "sentiment_indices" in article and article["sentiment_indices"]:
+                            st.markdown("**Sentiment Indices:**")
+                            indices = article["sentiment_indices"]
+
+                            # Create columns for displaying indices
+                            idx_cols = st.columns(3)
+
+                            # Display positivity and negativity in first column
+                            with idx_cols[0]:
+                                if "positivity_index" in indices:
+                                    st.markdown(f"**Positivity:** {indices['positivity_index']:.2f}")
+                                if "negativity_index" in indices:
+                                    st.markdown(f"**Negativity:** {indices['negativity_index']:.2f}")
+
+                            # Display emotional intensity and controversy in second column
+                            with idx_cols[1]:
+                                if "emotional_intensity" in indices:
+                                    st.markdown(f"**Emotional Intensity:** {indices['emotional_intensity']:.2f}")
+                                if "controversy_score" in indices:
+                                    st.markdown(f"**Controversy:** {indices['controversy_score']:.2f}")
+
+                            # Display confidence and ESG in third column
+                            with idx_cols[2]:
+                                if "confidence_score" in indices:
+                                    st.markdown(f"**Confidence:** {indices['confidence_score']:.2f}")
+                                if "esg_relevance" in indices:
+                                    st.markdown(f"**ESG Relevance:** {indices['esg_relevance']:.2f}")
+
+                        # Display entities if available
+                        if "entities" in article and article["entities"]:
+                            st.markdown("**Named Entities:**")
+                            entities = article["entities"]
+
+                            # Organizations
+                            if "ORG" in entities and entities["ORG"]:
+                                st.write("**Organizations:**", ", ".join(entities["ORG"]))
+
+                            # People
+                            if "PERSON" in entities and entities["PERSON"]:
+                                st.write("**People:**", ", ".join(entities["PERSON"]))
+
+                            # Locations
+                            if "GPE" in entities and entities["GPE"]:
+                                st.write("**Locations:**", ", ".join(entities["GPE"]))
+
+                            # Money
+                            if "MONEY" in entities and entities["MONEY"]:
+                                st.write("**Financial Values:**", ", ".join(entities["MONEY"]))
+
+                        # Display sentiment targets if available
+                        if "sentiment_targets" in article and article["sentiment_targets"]:
+                            st.markdown("**Sentiment Targets:**")
+                            targets = article["sentiment_targets"]
+                            for target in targets:
+                                st.markdown(f"**{target['entity']}** ({target['type']}): {target['sentiment']} ({target['confidence']*100:.1f}%)")
+                                st.markdown(f"> {target['context']}")
+                                st.markdown("---")
+
+                        if "url" in article:
+                            st.write("**[Read More](%s)**" % article["url"])
+
+                # Display Comparative Analysis
+                st.header("📊 Comparative Analysis")
+                analysis = result.get("comparative_sentiment_score", {})
+
+                # Sentiment Distribution
+                if "sentiment_distribution" in analysis:
+                    st.subheader("Sentiment Distribution")
+
+                    # Debug: Print sentiment distribution data
+                    print("Sentiment Distribution Data:")
+                    print(json.dumps(analysis["sentiment_distribution"], indent=2))
+
+                    sentiment_dist = analysis["sentiment_distribution"]
+
+                    # Create a very simple visualization that will definitely work
+                    try:
+                        # Extract basic sentiment data
+                        if isinstance(sentiment_dist, dict):
+                            if "basic" in sentiment_dist and isinstance(sentiment_dist["basic"], dict):
+                                basic_dist = sentiment_dist["basic"]
+                            elif any(k in sentiment_dist for k in ['positive', 'negative', 'neutral']):
+                                basic_dist = {k: v for k, v in sentiment_dist.items()
+                                              if k in ['positive', 'negative', 'neutral']}
+                            else:
+                                basic_dist = {'positive': 0, 'negative': 0, 'neutral': 1}
+                        else:
+                            basic_dist = {'positive': 0, 'negative': 0, 'neutral': 1}
+
+                        # Calculate percentages
+                        total_articles = sum(basic_dist.values())
+                        if total_articles > 0:
+                            percentages = {
+                                k: (v / total_articles) * 100
+                                for k, v in basic_dist.items()
+                            }
+                        else:
+                            percentages = {k: 0 for k in basic_dist}
+
+                        # Display as simple text and metrics
+                        st.write("**Sentiment Distribution:**")
+
+                        col1, col2, col3 = st.columns(3)
+                        with col1:
+                            st.metric(
+                                "Positive",
+                                basic_dist.get('positive', 0),
+                                f"{percentages.get('positive', 0):.1f}%"
+                            )
+                        with col2:
+                            st.metric(
+                                "Negative",
+                                basic_dist.get('negative', 0),
+                                f"{percentages.get('negative', 0):.1f}%"
+                            )
+                        with col3:
+                            st.metric(
+                                "Neutral",
+                                basic_dist.get('neutral', 0),
+                                f"{percentages.get('neutral', 0):.1f}%"
+                            )
+
+                        # Create a simple bar chart using Altair
+
+                        # Create a simple DataFrame with consistent capitalization and percentages
+                        chart_data = pd.DataFrame({
+                            'Sentiment': ['Positive', 'Negative', 'Neutral'],
+                            'Count': [
+                                basic_dist.get('positive', 0),  # Map lowercase keys to capitalized display
+                                basic_dist.get('negative', 0),
+                                basic_dist.get('neutral', 0)
+                            ],
+                            'Percentage': [
+                                f"{percentages.get('positive', 0):.1f}%",
+                                f"{percentages.get('negative', 0):.1f}%",
+                                f"{percentages.get('neutral', 0):.1f}%"
+                            ]
+                        })
+
+                        # Add debug output to see what's in the data
+                        print("Chart Data for Sentiment Distribution:")
+                        print(chart_data)
+
+                        # Create a simple bar chart with percentages
+                        chart = alt.Chart(chart_data).mark_bar().encode(
+                            y='Sentiment',  # Changed from x to y for horizontal bars
+                            x='Count',  # Changed from y to x for horizontal bars
+                            color=alt.Color('Sentiment', scale=alt.Scale(
+                                domain=['Positive', 'Negative', 'Neutral'],
+                                range=['green', 'red', 'gray']
+                            )),
+                            tooltip=['Sentiment', 'Count', 'Percentage']  # Add tooltip with percentage
+                        ).properties(
+                            width=600,
+                            height=300
+                        )
+
+                        # Add text labels with percentages
+                        text = chart.mark_text(
+                            align='left',
+                            baseline='middle',
+                            dx=3  # Nudge text to the right so it doesn't overlap with the bar
+                        ).encode(
+                            text='Percentage'
+                        )
+
+                        # Combine the chart and text
+                        chart_with_text = (chart + text)
+
+                        st.altair_chart(chart_with_text, use_container_width=True)
+
+                    except Exception as e:
+                        st.error(f"Error creating visualization: {str(e)}")
+                        st.write("Fallback to simple text display:")
+                        if isinstance(sentiment_dist, dict):
+                            if "basic" in sentiment_dist:
+                                st.write(f"Positive: {sentiment_dist['basic'].get('positive', 0)}")
+                                st.write(f"Negative: {sentiment_dist['basic'].get('negative', 0)}")
+                                st.write(f"Neutral: {sentiment_dist['basic'].get('neutral', 0)}")
+                            else:
+                                st.write(f"Positive: {sentiment_dist.get('positive', 0)}")
+                                st.write(f"Negative: {sentiment_dist.get('negative', 0)}")
+                                st.write(f"Neutral: {sentiment_dist.get('neutral', 0)}")
+                        else:
+                            st.write("No valid sentiment data available")
+
+                # Display sentiment indices if available
+                if "sentiment_indices" in analysis and analysis["sentiment_indices"]:
+                    st.subheader("Sentiment Indices")
+
+                    # Debug: Print sentiment indices
+                    print("Sentiment Indices:")
+                    print(json.dumps(analysis["sentiment_indices"], indent=2))
+
+                    # Get the indices data
+                    indices = analysis["sentiment_indices"]
+
+                    # Create a very simple visualization that will definitely work
+                    try:
+                        if isinstance(indices, dict):
+                            # Display as simple metrics in columns
+                            cols = st.columns(3)
+
+                            # Define display names and descriptions
+                            display_names = {
+                                "positivity_index": "Positivity",
+                                "negativity_index": "Negativity",
+                                "emotional_intensity": "Emotional Intensity",
+                                "controversy_score": "Controversy",
+                                "confidence_score": "Confidence",
+                                "esg_relevance": "ESG Relevance"
+                            }
+
+                            # Display each index as a metric
+                            for i, (key, value) in enumerate(indices.items()):
+                                if isinstance(value, (int, float)):
+                                    with cols[i % 3]:
+                                        display_name = display_names.get(key, key.replace("_", " ").title())
+                                        st.metric(display_name, f"{value:.2f}")
+
+                            # Create a simple bar chart using Altair
+
+                            # Create a simple DataFrame
+                            chart_data = pd.DataFrame({
+                                'Index': [display_names.get(k, k.replace("_", " ").title()) for k in indices.keys()],
+                                'Value': [v if isinstance(v, (int, float)) else 0 for v in indices.values()]
+                            })
+
+                            # Create a simple bar chart
+                            chart = alt.Chart(chart_data).mark_bar().encode(
+                                x='Value',
+                                y='Index',
+                                color=alt.Color('Index')
+                            ).properties(
+                                width=600,
+                                height=300
+                            )
+
+                            st.altair_chart(chart, use_container_width=True)
+
+                            # Add descriptions
+                            with st.expander("Sentiment Indices Explained"):
+                                st.markdown("""
+                                - **Positivity**: Measures the positive sentiment in the articles (0-1)
+                                - **Negativity**: Measures the negative sentiment in the articles (0-1)
+                                - **Emotional Intensity**: Measures the overall emotional content (0-1)
+                                - **Controversy**: High when both positive and negative sentiments are strong (0-1)
+                                - **Confidence**: Confidence in the sentiment analysis (0-1)
+                                - **ESG Relevance**: Relevance to Environmental, Social, and Governance topics (0-1)
+                                """)
+                        else:
+                            st.warning("Sentiment indices data is not in the expected format.")
+                            st.write("No valid sentiment indices available")
+                    except Exception as e:
+                        st.error(f"Error creating indices visualization: {str(e)}")
+                        st.write("Fallback to simple text display:")
+                        if isinstance(indices, dict):
+                            for key, value in indices.items():
+                                if isinstance(value, (int, float)):
+                                    st.write(f"{key.replace('_', ' ').title()}: {value:.2f}")
+                        else:
+                            st.write("No valid sentiment indices data available")
+
+                # Source Distribution
+                if "source_distribution" in analysis:
+                    st.subheader("Source Distribution")
+                    source_df = pd.DataFrame.from_dict(
+                        analysis["source_distribution"],
+                        orient='index',
+                        columns=['Count']
+                    )
+                    st.bar_chart(source_df)
+
+                # Common Topics
+                if "common_topics" in analysis:
+                    st.subheader("Common Topics")
+                    st.write(", ".join(analysis["common_topics"]) if analysis["common_topics"] else "No common topics found")
+
+                # Coverage Differences
+                if "coverage_differences" in analysis:
+                    st.subheader("Coverage Analysis")
+                    for diff in analysis["coverage_differences"]:
+                        st.write("- " + diff)
+
+                # Display Final Sentiment and Audio
+                st.header("🎯 Final Analysis")
+                if "final_sentiment_analysis" in result:
+                    st.write(result["final_sentiment_analysis"])
+
+                    # Display sentiment indices in the sidebar if available
+                    if "sentiment_indices" in analysis and analysis["sentiment_indices"]:
+                        indices = analysis["sentiment_indices"]
+                        # Verify we have valid data
+                        if indices and any(isinstance(v, (int, float)) for v in indices.values()):
+                            st.sidebar.markdown("### Sentiment Indices")
+                            for idx_name, idx_value in indices.items():
+                                if isinstance(idx_value, (int, float)):
+                                    formatted_name = " ".join(word.capitalize() for word in idx_name.replace("_", " ").split())
+                                    st.sidebar.metric(formatted_name, f"{idx_value:.2f}")
+
+                    # Display ensemble model information if available
+                    if "ensemble_info" in result:
+                        with st.expander("Ensemble Model Details"):
+                            ensemble = result["ensemble_info"]
+
+                            # Model agreement
+                            if "agreement" in ensemble:
+                                st.metric("Model Agreement", f"{ensemble['agreement']*100:.1f}%")
+
+                            # Individual model results
+                            if "models" in ensemble:
+                                st.subheader("Individual Model Results")
+                                models_data = []
+                                for model_name, model_info in ensemble["models"].items():
+                                    models_data.append({
+                                        "Model": model_name,
+                                        "Sentiment": model_info.get("sentiment", "N/A"),
+                                        "Confidence": f"{model_info.get('confidence', 0)*100:.1f}%"
+                                    })
+
+                                if models_data:
+                                    st.table(pd.DataFrame(models_data))
+
+                # Audio Playback Section
+                st.subheader("🔊 Listen to Analysis (Hindi)")
+                if 'audio_content' in result:
+                    st.audio(result['audio_content'], format='audio/mp3')
+                else:
+                    st.warning("Hindi audio summary not available")
+
+                # Total Articles
+                if "total_articles" in analysis:
+                    st.sidebar.info(f"Found {analysis['total_articles']} articles")
+
+    # Add a disclaimer
+    st.sidebar.markdown("---")
+    st.sidebar.markdown("### About")
+    st.sidebar.write("This app analyzes news articles and provides sentiment analysis for any company.")
+
+if __name__ == "__main__":
+    main()
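The rendering logic above implies a particular shape for the comparative_sentiment_score payload. A hypothetical example of the minimum structure the UI reads is sketched below; the real dictionary is produced by the analysis code in utils.py and may contain additional keys, and the values here are invented purely for illustration.

# Illustrative only: keys inferred from the lookups in main() above.
analysis_example = {
    "sentiment_distribution": {"basic": {"positive": 6, "negative": 3, "neutral": 11}},
    "sentiment_indices": {
        "positivity_index": 0.42,
        "negativity_index": 0.21,
        "emotional_intensity": 0.35,
        "controversy_score": 0.18,
        "confidence_score": 0.77,
        "esg_relevance": 0.12,
    },
    "source_distribution": {"Google News": 8, "Bing News": 7, "Yahoo News": 5},
    "common_topics": ["earnings", "expansion"],
    "coverage_differences": ["Financial outlets focus on earnings; tech outlets focus on products."],
    "total_articles": 20,
}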
config.py
ADDED
@@ -0,0 +1,85 @@
+"""Configuration settings for the News Summarization application."""
+
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# API Settings
+API_HOST = os.getenv("API_HOST", "0.0.0.0")
+API_PORT = int(os.getenv("API_PORT", "8005"))
+API_BASE_URL = os.getenv("API_BASE_URL", f"http://{API_HOST}:{API_PORT}")
+
+# News Scraping Settings
+ARTICLES_PER_SOURCE = int(os.getenv("ARTICLES_PER_SOURCE", "10"))
+USER_AGENT = os.getenv("USER_AGENT", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+
+# RSS Feed Settings
+RSS_FEEDS = {
+    "BBC": "http://feeds.bbci.co.uk/news/business/rss.xml",
+    "CNN": "http://rss.cnn.com/rss/money_news_international.rss",
+    "FoxBusiness": "http://feeds.foxnews.com/foxbusiness/latest"
+}
+
+# Model Settings
+SENTIMENT_MODEL = "yiyanghkust/finbert-tone"  # More advanced financial sentiment model
+SENTIMENT_FINE_GRAINED_MODEL = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
+SUMMARIZATION_MODEL = "t5-base"
+
+# Additional Fine-Grained Sentiment Models
+FINE_GRAINED_MODELS = {
+    "financial": "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
+    "emotion": "j-hartmann/emotion-english-distilroberta-base",
+    "aspect": "yangheng/deberta-v3-base-absa-v1.1",
+    "esg": "yiyanghkust/finbert-esg",
+    "news_tone": "ProsusAI/finbert"
+}
+
+# Fine-Grained Sentiment Categories
+SENTIMENT_CATEGORIES = {
+    "financial": ["positive", "negative", "neutral"],
+    "emotion": ["joy", "sadness", "anger", "fear", "surprise", "disgust", "neutral"],
+    "aspect": ["positive", "negative", "neutral"],
+    "esg": ["environmental", "social", "governance", "neutral"],
+    "news_tone": ["positive", "negative", "neutral"]
+}
+
+# Cache Settings
+CACHE_DIR = os.getenv("CACHE_DIR", ".cache")
+CACHE_EXPIRY = int(os.getenv("CACHE_EXPIRY", "3600"))  # 1 hour
+CACHE_DURATION = int(os.getenv("CACHE_DURATION", "300"))  # 5 minutes in seconds
+
+# Audio Settings
+AUDIO_OUTPUT_DIR = os.getenv("AUDIO_OUTPUT_DIR", "audio_output")
+DEFAULT_LANG = os.getenv("DEFAULT_LANG", "hi")  # Hindi
+
+# News Sources
+NEWS_SOURCES = {
+    # Major News Aggregators
+    "google": "https://www.google.com/search?q={}&tbm=nws",
+    "bing": "https://www.bing.com/news/search?q={}",
+    "yahoo": "https://news.search.yahoo.com/search?p={}",
+
+    # Financial News
+    "reuters": "https://www.reuters.com/search/news?blob={}",
+    "marketwatch": "https://www.marketwatch.com/search?q={}&ts=0&tab=All%20News",
+    "investing": "https://www.investing.com/search/?q={}&tab=news",
+
+    # Tech News
+    "techcrunch": "https://techcrunch.com/search/{}",
+    "zdnet": "https://www.zdnet.com/search/?q={}",
+}
+
+# Article limits
+MIN_ARTICLES = 20
+MAX_ARTICLES_PER_SOURCE = 10  # Adjusted for more sources
+MAX_ARTICLES = 50  # Increased to accommodate more sources
+
+# Browser Headers
+HEADERS = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Connection": "keep-alive"
+}
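Each NEWS_SOURCES entry is a search-URL template with a single {} placeholder; the scraper fills it with the company name after replacing spaces with "+" (see NewsExtractor.search_news in utils.py). A small sketch of that expansion, assuming the NEWS_SOURCES dict defined above:

company = "Reliance Industries"
query = company.replace(" ", "+")           # same substitution utils.py applies
url = NEWS_SOURCES["google"].format(query)
# -> https://www.google.com/search?q=Reliance+Industries&tbm=nws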
requirements.txt
ADDED
@@ -0,0 +1,21 @@
+streamlit==1.31.1
+beautifulsoup4==4.12.2
+requests==2.31.0
+pandas==2.2.0
+nltk==3.8.1
+transformers==4.37.2
+torch==2.2.0
+fastapi==0.109.2
+uvicorn==0.27.1
+python-multipart==0.0.6
+gTTS==2.5.0
+scikit-learn==1.4.0
+numpy==1.26.3
+python-dotenv==1.0.1
+aiofiles==23.2.1
+googletrans==3.1.0a0
+lxml==4.9.3
+spacy==3.7.2
+plotly==5.18.0
+textblob==0.17.1
+vaderSentiment==3.3.2
utils.py
ADDED
@@ -0,0 +1,1402 @@
| 1 |
+
"""Utility functions for news extraction, sentiment analysis, and text-to-speech."""
|
| 2 |
+
|
| 3 |
+
import requests
|
| 4 |
+
from bs4 import BeautifulSoup
|
| 5 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
|
| 6 |
+
from gtts import gTTS
|
| 7 |
+
import os
|
| 8 |
+
from typing import List, Dict, Any
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 11 |
+
from config import *
|
| 12 |
+
import re
|
| 13 |
+
from datetime import datetime, timedelta
|
| 14 |
+
import time
|
| 15 |
+
import json
|
| 16 |
+
from googletrans import Translator
|
| 17 |
+
import statistics
|
| 18 |
+
|
| 19 |
+
class NewsExtractor:
|
| 20 |
+
def __init__(self):
|
| 21 |
+
self.headers = HEADERS
|
| 22 |
+
|
| 23 |
+
def search_news(self, company_name: str) -> List[Dict[str, str]]:
|
| 24 |
+
"""Extract news articles about the company ensuring minimum count."""
|
| 25 |
+
all_articles = []
|
| 26 |
+
retries = 2 # Number of retries if we don't get enough articles
|
| 27 |
+
|
| 28 |
+
while retries > 0 and len(all_articles) < MIN_ARTICLES:
|
| 29 |
+
for source, url_template in NEWS_SOURCES.items():
|
| 30 |
+
try:
|
| 31 |
+
url = url_template.format(company_name.replace(" ", "+"))
|
| 32 |
+
print(f"\nSearching {source} for news about {company_name}...")
|
| 33 |
+
|
| 34 |
+
# Try different page numbers for more articles
|
| 35 |
+
for page in range(2): # Try first two pages
|
| 36 |
+
page_url = url
|
| 37 |
+
if page > 0:
|
| 38 |
+
if source == "google":
|
| 39 |
+
page_url += f"&start={page * 10}"
|
| 40 |
+
elif source == "bing":
|
| 41 |
+
page_url += f"&first={page * 10 + 1}"
|
| 42 |
+
elif source == "yahoo":
|
| 43 |
+
page_url += f"&b={page * 10 + 1}"
|
| 44 |
+
elif source == "reuters":
|
| 45 |
+
page_url += f"&page={page + 1}"
|
| 46 |
+
elif source == "marketwatch":
|
| 47 |
+
page_url += f"&page={page + 1}"
|
| 48 |
+
elif source == "investing":
|
| 49 |
+
page_url += f"&page={page + 1}"
|
| 50 |
+
elif source == "techcrunch":
|
| 51 |
+
page_url += f"/page/{page + 1}"
|
| 52 |
+
elif source == "zdnet":
|
| 53 |
+
page_url += f"&page={page + 1}"
|
| 54 |
+
|
| 55 |
+
response = requests.get(page_url, headers=self.headers, timeout=15)
|
| 56 |
+
if response.status_code != 200:
|
| 57 |
+
print(f"Error: {source} page {page+1} returned status code {response.status_code}")
|
| 58 |
+
continue
|
| 59 |
+
|
| 60 |
+
soup = BeautifulSoup(response.content, 'html.parser')
|
| 61 |
+
|
| 62 |
+
source_articles = []
|
| 63 |
+
if source == "google":
|
| 64 |
+
source_articles = self._parse_google_news(soup)
|
| 65 |
+
elif source == "bing":
|
| 66 |
+
source_articles = self._parse_bing_news(soup)
|
| 67 |
+
elif source == "yahoo":
|
| 68 |
+
source_articles = self._parse_yahoo_news(soup)
|
| 69 |
+
elif source == "reuters":
|
| 70 |
+
source_articles = self._parse_reuters_news(soup)
|
| 71 |
+
elif source == "marketwatch":
|
| 72 |
+
source_articles = self._parse_marketwatch_news(soup)
|
| 73 |
+
elif source == "investing":
|
| 74 |
+
source_articles = self._parse_investing_news(soup)
|
| 75 |
+
elif source == "techcrunch":
|
| 76 |
+
source_articles = self._parse_techcrunch_news(soup)
|
| 77 |
+
elif source == "zdnet":
|
| 78 |
+
source_articles = self._parse_zdnet_news(soup)
|
| 79 |
+
|
| 80 |
+
# Limit articles per source
|
| 81 |
+
if source_articles:
|
| 82 |
+
source_articles = source_articles[:MAX_ARTICLES_PER_SOURCE]
|
| 83 |
+
all_articles.extend(source_articles)
|
| 84 |
+
print(f"Found {len(source_articles)} articles from {source} page {page+1}")
|
| 85 |
+
|
| 86 |
+
# If we have enough articles, break the page loop
|
| 87 |
+
if len(all_articles) >= MIN_ARTICLES:
|
| 88 |
+
break
|
| 89 |
+
|
| 90 |
+
except Exception as e:
|
| 91 |
+
print(f"Error fetching from {source}: {str(e)}")
|
| 92 |
+
continue
|
| 93 |
+
|
| 94 |
+
# If we have enough articles, break the source loop
|
| 95 |
+
if len(all_articles) >= MIN_ARTICLES:
|
| 96 |
+
break
|
| 97 |
+
|
| 98 |
+
retries -= 1
|
| 99 |
+
if len(all_articles) < MIN_ARTICLES and retries > 0:
|
| 100 |
+
print(f"\nFound only {len(all_articles)} articles, retrying...")
|
| 101 |
+
|
| 102 |
+
# Remove duplicates
|
| 103 |
+
unique_articles = self._remove_duplicates(all_articles)
|
| 104 |
+
print(f"\nFound {len(unique_articles)} unique articles")
|
| 105 |
+
|
| 106 |
+
if len(unique_articles) < MIN_ARTICLES:
|
| 107 |
+
print(f"Warning: Could only find {len(unique_articles)} unique articles, fewer than minimum {MIN_ARTICLES}")
|
| 108 |
+
|
| 109 |
+
# Balance articles across sources
|
| 110 |
+
balanced_articles = self._balance_sources(unique_articles)
|
| 111 |
+
return balanced_articles[:max(MIN_ARTICLES, MAX_ARTICLES)]
|
| 112 |
+
|
| 113 |
+
def _balance_sources(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
| 114 |
+
"""Balance articles across sources while maintaining minimum count."""
|
| 115 |
+
source_articles = {}
|
| 116 |
+
|
| 117 |
+
# Group articles by source
|
| 118 |
+
for article in articles:
|
| 119 |
+
source = article['source']
|
| 120 |
+
if source not in source_articles:
|
| 121 |
+
source_articles[source] = []
|
| 122 |
+
source_articles[source].append(article)
|
| 123 |
+
|
| 124 |
+
# Calculate target articles per source
|
| 125 |
+
total_sources = len(source_articles)
|
| 126 |
+
target_per_source = max(MIN_ARTICLES // total_sources,
|
| 127 |
+
MAX_ARTICLES_PER_SOURCE)
|
| 128 |
+
|
| 129 |
+
# Get articles from each source
|
| 130 |
+
balanced = []
|
| 131 |
+
for source, articles_list in source_articles.items():
|
| 132 |
+
balanced.extend(articles_list[:target_per_source])
|
| 133 |
+
|
| 134 |
+
# If we still need more articles to meet minimum, add more from sources
|
| 135 |
+
# that have additional articles
|
| 136 |
+
if len(balanced) < MIN_ARTICLES:
|
| 137 |
+
remaining = []
|
| 138 |
+
for articles_list in source_articles.values():
|
| 139 |
+
remaining.extend(articles_list[target_per_source:])
|
| 140 |
+
|
| 141 |
+
# Sort remaining by source to maintain balance
|
| 142 |
+
remaining.sort(key=lambda x: len([a for a in balanced if a['source'] == x['source']]))
|
| 143 |
+
|
| 144 |
+
while len(balanced) < MIN_ARTICLES and remaining:
|
| 145 |
+
balanced.append(remaining.pop(0))
|
| 146 |
+
|
| 147 |
+
return balanced
|
| 148 |
+
|
| 149 |
+
def _parse_google_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 150 |
+
"""Parse Google News search results."""
|
| 151 |
+
articles = []
|
| 152 |
+
for div in soup.find_all(['div', 'article'], class_=['g', 'xuvV6b', 'WlydOe']):
|
| 153 |
+
try:
|
| 154 |
+
title_elem = div.find(['h3', 'h4'])
|
| 155 |
+
snippet_elem = div.find('div', class_=['VwiC3b', 'yy6M1d'])
|
| 156 |
+
link_elem = div.find('a')
|
| 157 |
+
source_elem = div.find(['div', 'span'], class_='UPmit')
|
| 158 |
+
|
| 159 |
+
if title_elem and snippet_elem and link_elem:
|
| 160 |
+
source = source_elem.get_text(strip=True) if source_elem else 'Google News'
|
| 161 |
+
articles.append({
|
| 162 |
+
'title': title_elem.get_text(strip=True),
|
| 163 |
+
'content': snippet_elem.get_text(strip=True),
|
| 164 |
+
'url': link_elem['href'],
|
| 165 |
+
'source': source
|
| 166 |
+
})
|
| 167 |
+
except Exception as e:
|
| 168 |
+
print(f"Error parsing Google article: {str(e)}")
|
| 169 |
+
continue
|
| 170 |
+
return articles
|
| 171 |
+
|
| 172 |
+
def _parse_bing_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 173 |
+
"""Parse Bing News search results."""
|
| 174 |
+
articles = []
|
| 175 |
+
for article in soup.find_all(['div', 'article'], class_=['news-card', 'newsitem', 'item-info']):
|
| 176 |
+
try:
|
| 177 |
+
title_elem = article.find(['a', 'h3'], class_=['title', 'news-card-title'])
|
| 178 |
+
snippet_elem = article.find(['div', 'p'], class_=['snippet', 'description'])
|
| 179 |
+
source_elem = article.find(['div', 'span'], class_=['source', 'provider'])
|
| 180 |
+
|
| 181 |
+
if title_elem and snippet_elem:
|
| 182 |
+
source = source_elem.get_text(strip=True) if source_elem else 'Bing News'
|
| 183 |
+
url = title_elem['href'] if 'href' in title_elem.attrs else ''
|
| 184 |
+
articles.append({
|
| 185 |
+
'title': title_elem.get_text(strip=True),
|
| 186 |
+
'content': snippet_elem.get_text(strip=True),
|
| 187 |
+
'url': url,
|
| 188 |
+
'source': source
|
| 189 |
+
})
|
| 190 |
+
except Exception as e:
|
| 191 |
+
print(f"Error parsing Bing article: {str(e)}")
|
| 192 |
+
return articles
|
| 193 |
+
|
| 194 |
+
def _parse_yahoo_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 195 |
+
"""Parse Yahoo News search results."""
|
| 196 |
+
articles = []
|
| 197 |
+
for article in soup.find_all('div', class_='NewsArticle'):
|
| 198 |
+
try:
|
| 199 |
+
title_elem = article.find(['h4', 'h3', 'a'])
|
| 200 |
+
snippet_elem = article.find('p')
|
| 201 |
+
source_elem = article.find(['span', 'div'], class_=['provider', 'source'])
|
| 202 |
+
|
| 203 |
+
if title_elem and snippet_elem:
|
| 204 |
+
source = source_elem.get_text(strip=True) if source_elem else 'Yahoo News'
|
| 205 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
| 206 |
+
articles.append({
|
| 207 |
+
'title': title_elem.get_text(strip=True),
|
| 208 |
+
'content': snippet_elem.get_text(strip=True),
|
| 209 |
+
'url': url,
|
| 210 |
+
'source': source
|
| 211 |
+
})
|
| 212 |
+
except Exception as e:
|
| 213 |
+
print(f"Error parsing Yahoo article: {str(e)}")
|
| 214 |
+
return articles
|
| 215 |
+
|
| 216 |
+
def _parse_reuters_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 217 |
+
"""Parse Reuters search results."""
|
| 218 |
+
articles = []
|
| 219 |
+
for article in soup.find_all(['div', 'article'], class_=['search-result-content', 'story']):
|
| 220 |
+
try:
|
| 221 |
+
title_elem = article.find(['h3', 'a'], class_='story-title')
|
| 222 |
+
snippet_elem = article.find(['p', 'div'], class_=['story-description', 'description'])
|
| 223 |
+
|
| 224 |
+
if title_elem:
|
| 225 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
| 226 |
+
if url and not url.startswith('http'):
|
| 227 |
+
url = 'https://www.reuters.com' + url
|
| 228 |
+
|
| 229 |
+
articles.append({
|
| 230 |
+
'title': title_elem.get_text(strip=True),
|
| 231 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
| 232 |
+
'url': url,
|
| 233 |
+
'source': 'Reuters'
|
| 234 |
+
})
|
| 235 |
+
except Exception as e:
|
| 236 |
+
print(f"Error parsing Reuters article: {str(e)}")
|
| 237 |
+
return articles
|
| 238 |
+
|
| 239 |
+
def _parse_marketwatch_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 240 |
+
"""Parse MarketWatch search results."""
|
| 241 |
+
articles = []
|
| 242 |
+
for article in soup.find_all(['div', 'article'], class_=['element--article', 'article__content']):
|
| 243 |
+
try:
|
| 244 |
+
title_elem = article.find(['h3', 'h2'], class_=['article__headline', 'title'])
|
| 245 |
+
snippet_elem = article.find('p', class_=['article__summary', 'description'])
|
| 246 |
+
|
| 247 |
+
if title_elem:
|
| 248 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
| 249 |
+
articles.append({
|
| 250 |
+
'title': title_elem.get_text(strip=True),
|
| 251 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
| 252 |
+
'url': url,
|
| 253 |
+
'source': 'MarketWatch'
|
| 254 |
+
})
|
| 255 |
+
except Exception as e:
|
| 256 |
+
print(f"Error parsing MarketWatch article: {str(e)}")
|
| 257 |
+
return articles
|
| 258 |
+
|
| 259 |
+
def _parse_investing_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 260 |
+
"""Parse Investing.com search results."""
|
| 261 |
+
articles = []
|
| 262 |
+
for article in soup.find_all(['div', 'article'], class_=['articleItem', 'news-item']):
|
| 263 |
+
try:
|
| 264 |
+
title_elem = article.find(['a', 'h3'], class_=['title', 'articleTitle'])
|
| 265 |
+
snippet_elem = article.find(['p', 'div'], class_=['description', 'articleContent'])
|
| 266 |
+
|
| 267 |
+
if title_elem:
|
| 268 |
+
url = title_elem['href'] if 'href' in title_elem.attrs else title_elem.find('a')['href']
|
| 269 |
+
if url and not url.startswith('http'):
|
| 270 |
+
url = 'https://www.investing.com' + url
|
| 271 |
+
|
| 272 |
+
articles.append({
|
| 273 |
+
'title': title_elem.get_text(strip=True),
|
| 274 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
| 275 |
+
'url': url,
|
| 276 |
+
'source': 'Investing.com'
|
| 277 |
+
})
|
| 278 |
+
except Exception as e:
|
| 279 |
+
print(f"Error parsing Investing.com article: {str(e)}")
|
| 280 |
+
return articles
|
| 281 |
+
|
| 282 |
+
def _parse_techcrunch_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 283 |
+
"""Parse TechCrunch search results."""
|
| 284 |
+
articles = []
|
| 285 |
+
for article in soup.find_all(['div', 'article'], class_=['post-block', 'article-block']):
|
| 286 |
+
try:
|
| 287 |
+
title_elem = article.find(['h2', 'h3', 'a'], class_=['post-block__title', 'article-title'])
|
| 288 |
+
snippet_elem = article.find(['div', 'p'], class_=['post-block__content', 'article-content'])
|
| 289 |
+
|
| 290 |
+
if title_elem:
|
| 291 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
| 292 |
+
articles.append({
|
| 293 |
+
'title': title_elem.get_text(strip=True),
|
| 294 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
| 295 |
+
'url': url,
|
| 296 |
+
'source': 'TechCrunch'
|
| 297 |
+
})
|
| 298 |
+
except Exception as e:
|
| 299 |
+
print(f"Error parsing TechCrunch article: {str(e)}")
|
| 300 |
+
return articles
|
| 301 |
+
|
| 302 |
+
def _parse_zdnet_news(self, soup: BeautifulSoup) -> List[Dict[str, str]]:
|
| 303 |
+
"""Parse ZDNet search results."""
|
| 304 |
+
articles = []
|
| 305 |
+
for article in soup.find_all(['div', 'article'], class_=['item', 'article']):
|
| 306 |
+
try:
|
| 307 |
+
title_elem = article.find(['h3', 'a'], class_=['title', 'headline'])
|
| 308 |
+
snippet_elem = article.find(['p', 'div'], class_=['summary', 'content'])
|
| 309 |
+
|
| 310 |
+
if title_elem:
|
| 311 |
+
url = title_elem.find('a')['href'] if title_elem.find('a') else ''
|
| 312 |
+
if url and not url.startswith('http'):
|
| 313 |
+
url = 'https://www.zdnet.com' + url
|
| 314 |
+
|
| 315 |
+
articles.append({
|
| 316 |
+
'title': title_elem.get_text(strip=True),
|
| 317 |
+
'content': snippet_elem.get_text(strip=True) if snippet_elem else '',
|
| 318 |
+
'url': url,
|
| 319 |
+
'source': 'ZDNet'
|
| 320 |
+
})
|
| 321 |
+
except Exception as e:
|
| 322 |
+
print(f"Error parsing ZDNet article: {str(e)}")
|
| 323 |
+
return articles
|
| 324 |
+
|
| 325 |
+
def _remove_duplicates(self, articles: List[Dict[str, str]]) -> List[Dict[str, str]]:
|
| 326 |
+
"""Remove duplicate articles based on title similarity."""
|
| 327 |
+
unique_articles = []
|
| 328 |
+
seen_titles = set()
|
| 329 |
+
|
| 330 |
+
for article in articles:
|
| 331 |
+
title = article['title'].lower()
|
| 332 |
+
if not any(title in seen_title or seen_title in title for seen_title in seen_titles):
|
| 333 |
+
unique_articles.append(article)
|
| 334 |
+
seen_titles.add(title)
|
| 335 |
+
|
| 336 |
+
return unique_articles
|
| 337 |
+
|
| 338 |
+
class SentimentAnalyzer:
|
| 339 |
+
def __init__(self):
|
| 340 |
+
try:
|
| 341 |
+
# Primary financial sentiment model
|
| 342 |
+
self.sentiment_pipeline = pipeline("sentiment-analysis",
|
| 343 |
+
model=SENTIMENT_MODEL)
|
| 344 |
+
|
| 345 |
+
# Initialize fine-grained sentiment models
|
| 346 |
+
self.fine_grained_models = {}
|
| 347 |
+
try:
|
| 348 |
+
# Initialize the default fine-grained model for backward compatibility
|
| 349 |
+
self.fine_grained_sentiment = pipeline("sentiment-analysis",
|
| 350 |
+
model=SENTIMENT_FINE_GRAINED_MODEL)
|
| 351 |
+
|
| 352 |
+
# Initialize additional fine-grained models
|
| 353 |
+
for model_name, model_path in FINE_GRAINED_MODELS.items():
|
| 354 |
+
try:
|
| 355 |
+
print(f"Loading fine-grained model: {model_name}")
|
| 356 |
+
self.fine_grained_models[model_name] = pipeline("sentiment-analysis",
|
| 357 |
+
model=model_path)
|
| 358 |
+
except Exception as e:
|
| 359 |
+
print(f"Error loading fine-grained model {model_name}: {str(e)}")
|
| 360 |
+
except Exception as e:
|
| 361 |
+
print(f"Error initializing fine-grained models: {str(e)}")
|
| 362 |
+
self.fine_grained_sentiment = None
|
| 363 |
+
|
| 364 |
+
# Initialize additional sentiment analyzers if available
|
| 365 |
+
self.has_textblob = False
|
| 366 |
+
self.has_vader = False
|
| 367 |
+
|
| 368 |
+
try:
|
| 369 |
+
from textblob import TextBlob
|
| 370 |
+
self.TextBlob = TextBlob
|
| 371 |
+
self.has_textblob = True
|
| 372 |
+
except ImportError:
|
| 373 |
+
print("TextBlob not available. Install with: pip install textblob")
|
| 374 |
+
|
| 375 |
+
try:
|
| 376 |
+
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
|
| 377 |
+
self.vader = SentimentIntensityAnalyzer()
|
| 378 |
+
self.has_vader = True
|
| 379 |
+
except ImportError:
|
| 380 |
+
print("VADER not available. Install with: pip install vaderSentiment")
|
| 381 |
+
|
| 382 |
+
self.summarizer = pipeline("summarization",
|
| 383 |
+
model=SUMMARIZATION_MODEL)
|
| 384 |
+
self.vectorizer = TfidfVectorizer(stop_words='english',
|
| 385 |
+
max_features=10)
|
| 386 |
+
|
| 387 |
+
# Initialize NER pipeline if spaCy is available
|
| 388 |
+
try:
|
| 389 |
+
import spacy
|
| 390 |
+
self.nlp = spacy.load("en_core_web_sm")
|
| 391 |
+
self.has_ner = True
|
| 392 |
+
except Exception:  # ImportError if spaCy is missing, OSError if the model is not downloaded
|
| 393 |
+
self.has_ner = False
|
| 394 |
+
print("spaCy not available for NER. Install with: pip install spacy && python -m spacy download en_core_web_sm")
|
| 395 |
+
|
| 396 |
+
except Exception as e:
|
| 397 |
+
print(f"Error initializing sentiment models: {str(e)}")
|
| 398 |
+
# Fallback to default models if specific models fail
|
| 399 |
+
self.sentiment_pipeline = pipeline("sentiment-analysis")
|
| 400 |
+
self.fine_grained_sentiment = None
|
| 401 |
+
self.fine_grained_models = {}
|
| 402 |
+
self.summarizer = pipeline("summarization")
|
| 403 |
+
self.vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
|
| 404 |
+
self.has_ner = False
|
| 405 |
+
self.has_textblob = False
|
| 406 |
+
self.has_vader = False
|
| 407 |
+
|
| 408 |
+
def analyze_article(self, article: Dict[str, str]) -> Dict[str, Any]:
|
| 409 |
+
"""Analyze sentiment and generate summary for an article."""
|
| 410 |
+
try:
|
| 411 |
+
# Get the full text by combining title and content
|
| 412 |
+
full_text = f"{article['title']} {article['content']}"
|
| 413 |
+
|
| 414 |
+
# Generate summary
|
| 415 |
+
summary = self.summarize_text(full_text)
|
| 416 |
+
|
| 417 |
+
# Get ensemble sentiment analysis
|
| 418 |
+
sentiment_analysis = self._get_ensemble_sentiment(full_text)
|
| 419 |
+
sentiment_label = sentiment_analysis['ensemble_sentiment']
|
| 420 |
+
sentiment_score = sentiment_analysis['ensemble_score']
|
| 421 |
+
|
| 422 |
+
# Add fine-grained sentiment analysis
|
| 423 |
+
fine_grained_sentiment = self._get_fine_grained_sentiment(full_text)
|
| 424 |
+
|
| 425 |
+
# Extract key topics
|
| 426 |
+
topics = self.extract_topics(full_text)
|
| 427 |
+
|
| 428 |
+
# Extract named entities
|
| 429 |
+
entities = self._extract_entities(full_text)
|
| 430 |
+
|
| 431 |
+
# Extract sentiment targets (entities associated with sentiment)
|
| 432 |
+
sentiment_targets = self._extract_sentiment_targets(full_text, entities)
|
| 433 |
+
|
| 434 |
+
# Add analysis to article
|
| 435 |
+
analyzed_article = article.copy()
|
| 436 |
+
analyzed_article.update({
|
| 437 |
+
'summary': summary,
|
| 438 |
+
'sentiment': sentiment_label,
|
| 439 |
+
'sentiment_score': sentiment_score,
|
| 440 |
+
'sentiment_details': sentiment_analysis,
|
| 441 |
+
'fine_grained_sentiment': fine_grained_sentiment,
|
| 442 |
+
'topics': topics,
|
| 443 |
+
'entities': entities,
|
| 444 |
+
'sentiment_targets': sentiment_targets,
|
| 445 |
+
'sentiment_indices': fine_grained_sentiment.get('indices', {}),
|
| 446 |
+
'analysis_timestamp': datetime.now().isoformat()
|
| 447 |
+
})
|
| 448 |
+
|
| 449 |
+
return analyzed_article
|
| 450 |
+
|
| 451 |
+
except Exception as e:
|
| 452 |
+
print(f"Error analyzing article: {str(e)}")
|
| 453 |
+
# Return original article with default values if analysis fails
|
| 454 |
+
article.update({
|
| 455 |
+
'summary': article.get('content', '')[:200] + '...',
|
| 456 |
+
'sentiment': 'neutral',
|
| 457 |
+
'sentiment_score': 0.0,
|
| 458 |
+
'sentiment_details': {},
|
| 459 |
+
'fine_grained_sentiment': {},
|
| 460 |
+
'topics': [],
|
| 461 |
+
'entities': {},
|
| 462 |
+
'sentiment_targets': [],
|
| 463 |
+
'sentiment_indices': {
|
| 464 |
+
'positivity_index': 0.5,
|
| 465 |
+
'negativity_index': 0.5,
|
| 466 |
+
'emotional_intensity': 0.0,
|
| 467 |
+
'controversy_score': 0.0,
|
| 468 |
+
'confidence_score': 0.0,
|
| 469 |
+
'esg_relevance': 0.0
|
| 470 |
+
},
|
| 471 |
+
'analysis_timestamp': datetime.now().isoformat()
|
| 472 |
+
})
|
| 473 |
+
return article
|
| 474 |
+
|
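# Usage sketch (hypothetical article dict; assumes the models named in config.py can
# be downloaded on first use):
# analyzer = SentimentAnalyzer()
# article = {'title': 'Company X posts record profit',
#            'content': 'Shares rose after the quarterly report beat estimates.',
#            'url': 'https://example.com/x', 'source': 'Example'}
# result = analyzer.analyze_article(article)
# result['sentiment'], result['sentiment_score'], result['summary'], result['topics']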
| 475 |
+
def _get_ensemble_sentiment(self, text: str) -> Dict[str, Any]:
|
| 476 |
+
"""Get ensemble sentiment by combining multiple sentiment models."""
|
| 477 |
+
results = {}
|
| 478 |
+
|
| 479 |
+
# Initialize with default values
|
| 480 |
+
ensemble_result = {
|
| 481 |
+
'ensemble_sentiment': 'neutral',
|
| 482 |
+
'ensemble_score': 0.5,
|
| 483 |
+
'models': {}
|
| 484 |
+
}
|
| 485 |
+
|
| 486 |
+
try:
|
| 487 |
+
# 1. Primary transformer model (finbert)
|
| 488 |
+
try:
|
| 489 |
+
primary_result = self.sentiment_pipeline(text[:512])[0] # Limit text length
|
| 490 |
+
primary_label = primary_result['label'].lower()
|
| 491 |
+
primary_score = primary_result['score']
|
| 492 |
+
|
| 493 |
+
# Map to standard format
|
| 494 |
+
if primary_label == 'positive':
|
| 495 |
+
primary_normalized = primary_score
|
| 496 |
+
elif primary_label == 'negative':
|
| 497 |
+
primary_normalized = 1 - primary_score
|
| 498 |
+
else: # neutral
|
| 499 |
+
primary_normalized = 0.5
|
| 500 |
+
|
| 501 |
+
ensemble_result['models']['transformer'] = {
|
| 502 |
+
'sentiment': primary_label,
|
| 503 |
+
'score': round(primary_score, 3),
|
| 504 |
+
'normalized_score': round(primary_normalized, 3)
|
| 505 |
+
}
|
| 506 |
+
except Exception:
|
| 507 |
+
ensemble_result['models']['transformer'] = {
|
| 508 |
+
'sentiment': 'error',
|
| 509 |
+
'score': 0,
|
| 510 |
+
'normalized_score': 0.5
|
| 511 |
+
}
|
| 512 |
+
|
| 513 |
+
# 2. TextBlob sentiment
|
| 514 |
+
if self.has_textblob:
|
| 515 |
+
try:
|
| 516 |
+
blob = self.TextBlob(text)
|
| 517 |
+
polarity = blob.sentiment.polarity
|
| 518 |
+
|
| 519 |
+
# Convert to standard format
|
| 520 |
+
if polarity > 0.1:
|
| 521 |
+
textblob_sentiment = 'positive'
|
| 522 |
+
textblob_score = polarity
|
| 523 |
+
elif polarity < -0.1:
|
| 524 |
+
textblob_sentiment = 'negative'
|
| 525 |
+
textblob_score = abs(polarity)
|
| 526 |
+
else:
|
| 527 |
+
textblob_sentiment = 'neutral'
|
| 528 |
+
textblob_score = 0.5
|
| 529 |
+
|
| 530 |
+
# Normalize to 0-1 scale
|
| 531 |
+
textblob_normalized = (polarity + 1) / 2
|
| 532 |
+
|
| 533 |
+
ensemble_result['models']['textblob'] = {
|
| 534 |
+
'sentiment': textblob_sentiment,
|
| 535 |
+
'score': round(textblob_score, 3),
|
| 536 |
+
'normalized_score': round(textblob_normalized, 3)
|
| 537 |
+
}
|
| 538 |
+
except Exception:
|
| 539 |
+
ensemble_result['models']['textblob'] = {
|
| 540 |
+
'sentiment': 'error',
|
| 541 |
+
'score': 0,
|
| 542 |
+
'normalized_score': 0.5
|
| 543 |
+
}
|
| 544 |
+
|
| 545 |
+
# 3. VADER sentiment
|
| 546 |
+
if self.has_vader:
|
| 547 |
+
try:
|
| 548 |
+
vader_scores = self.vader.polarity_scores(text)
|
| 549 |
+
compound = vader_scores['compound']
|
| 550 |
+
|
| 551 |
+
# Convert to standard format
|
| 552 |
+
if compound > 0.05:
|
| 553 |
+
vader_sentiment = 'positive'
|
| 554 |
+
vader_score = compound
|
| 555 |
+
elif compound < -0.05:
|
| 556 |
+
vader_sentiment = 'negative'
|
| 557 |
+
vader_score = abs(compound)
|
| 558 |
+
else:
|
| 559 |
+
vader_sentiment = 'neutral'
|
| 560 |
+
vader_score = 0.5
|
| 561 |
+
|
| 562 |
+
# Normalize to 0-1 scale
|
| 563 |
+
vader_normalized = (compound + 1) / 2
|
| 564 |
+
|
| 565 |
+
ensemble_result['models']['vader'] = {
|
| 566 |
+
'sentiment': vader_sentiment,
|
| 567 |
+
'score': round(vader_score, 3),
|
| 568 |
+
'normalized_score': round(vader_normalized, 3)
|
| 569 |
+
}
|
| 570 |
+
except Exception:
|
| 571 |
+
ensemble_result['models']['vader'] = {
|
| 572 |
+
'sentiment': 'error',
|
| 573 |
+
'score': 0,
|
| 574 |
+
'normalized_score': 0.5
|
| 575 |
+
}
|
| 576 |
+
|
| 577 |
+
# Calculate ensemble result
|
| 578 |
+
# Get all normalized scores
|
| 579 |
+
normalized_scores = []
|
| 580 |
+
for model_name, model_result in ensemble_result['models'].items():
|
| 581 |
+
if model_result['sentiment'] != 'error':
|
| 582 |
+
normalized_scores.append(model_result['normalized_score'])
|
| 583 |
+
|
| 584 |
+
# Calculate average if we have scores
|
| 585 |
+
if normalized_scores:
|
| 586 |
+
avg_score = sum(normalized_scores) / len(normalized_scores)
|
| 587 |
+
|
| 588 |
+
# Convert to sentiment label
|
| 589 |
+
if avg_score > 0.6:
|
| 590 |
+
ensemble_sentiment = 'positive'
|
| 591 |
+
elif avg_score < 0.4:
|
| 592 |
+
ensemble_sentiment = 'negative'
|
| 593 |
+
else:
|
| 594 |
+
ensemble_sentiment = 'neutral'
|
| 595 |
+
|
| 596 |
+
ensemble_result['ensemble_sentiment'] = ensemble_sentiment
|
| 597 |
+
ensemble_result['ensemble_score'] = round(avg_score, 3)
|
| 598 |
+
|
| 599 |
+
# Add confidence level
|
| 600 |
+
if len(normalized_scores) > 1:
|
| 601 |
+
# Calculate standard deviation to measure agreement
|
| 602 |
+
std_dev = statistics.stdev(normalized_scores) if len(normalized_scores) > 1 else 0
|
| 603 |
+
agreement = 1 - (std_dev * 2) # Lower std_dev means higher agreement
|
| 604 |
+
agreement = max(0, min(1, agreement)) # Clamp to 0-1
|
| 605 |
+
|
| 606 |
+
ensemble_result['model_agreement'] = round(agreement, 3)
|
| 607 |
+
|
| 608 |
+
return ensemble_result
|
| 609 |
+
|
| 610 |
+
except Exception as e:
|
| 611 |
+
print(f"Error in ensemble sentiment analysis: {str(e)}")
|
| 612 |
+
return {
|
| 613 |
+
'ensemble_sentiment': 'neutral',
|
| 614 |
+
'ensemble_score': 0.5,
|
| 615 |
+
'models': {}
|
| 616 |
+
}
|
| 617 |
+
|
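# Worked sketch (not part of the module): how the ensemble step above maps averaged
# normalized scores to a label. The three scores are hypothetical model outputs.
normalized = [0.82, 0.74, 0.55]
avg = sum(normalized) / len(normalized)                      # ~0.703
label = 'positive' if avg > 0.6 else 'negative' if avg < 0.4 else 'neutral'
# label == 'positive'; statistics.stdev(normalized) then feeds the model-agreement term.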
| 618 |
+
def _get_fine_grained_sentiment(self, text: str) -> Dict[str, Any]:
|
| 619 |
+
"""Get fine-grained sentiment analysis with more detailed categories."""
|
| 620 |
+
# Initialize result structure
|
| 621 |
+
result = {
|
| 622 |
+
"primary": {"category": "unknown", "confidence": 0.0},
|
| 623 |
+
"models": {}
|
| 624 |
+
}
|
| 625 |
+
|
| 626 |
+
# Check if we have any fine-grained models
|
| 627 |
+
if not self.fine_grained_sentiment and not self.fine_grained_models:
|
| 628 |
+
return result
|
| 629 |
+
|
| 630 |
+
try:
|
| 631 |
+
# Split text into manageable chunks if too long
|
| 632 |
+
chunks = self._split_text(text)
|
| 633 |
+
|
| 634 |
+
# Process with default fine-grained model for backward compatibility
|
| 635 |
+
if self.fine_grained_sentiment:
|
| 636 |
+
primary_results = []
|
| 637 |
+
|
| 638 |
+
for chunk in chunks:
|
| 639 |
+
if not chunk.strip():
|
| 640 |
+
continue
|
| 641 |
+
chunk_result = self.fine_grained_sentiment(chunk)[0]
|
| 642 |
+
primary_results.append(chunk_result)
|
| 643 |
+
|
| 644 |
+
if primary_results:
|
| 645 |
+
# Aggregate results from all chunks
|
| 646 |
+
categories = {}
|
| 647 |
+
for res in primary_results:
|
| 648 |
+
label = res['label'].lower()
|
| 649 |
+
score = res['score']
|
| 650 |
+
if label in categories:
|
| 651 |
+
categories[label] += score
|
| 652 |
+
else:
|
| 653 |
+
categories[label] = score
|
| 654 |
+
|
| 655 |
+
# Normalize scores
|
| 656 |
+
total = sum(categories.values())
|
| 657 |
+
if total > 0:
|
| 658 |
+
categories = {k: round(v/total, 3) for k, v in categories.items()}
|
| 659 |
+
|
| 660 |
+
# Get dominant category
|
| 661 |
+
dominant_category = max(categories.items(), key=lambda x: x[1])
|
| 662 |
+
|
| 663 |
+
result["primary"] = {
|
| 664 |
+
"category": dominant_category[0],
|
| 665 |
+
"confidence": dominant_category[1],
|
| 666 |
+
"distribution": categories
|
| 667 |
+
}
|
| 668 |
+
|
| 669 |
+
# Process with additional fine-grained models
|
| 670 |
+
for model_name, model in self.fine_grained_models.items():
|
| 671 |
+
model_results = []
|
| 672 |
+
|
| 673 |
+
for chunk in chunks:
|
| 674 |
+
if not chunk.strip():
|
| 675 |
+
continue
|
| 676 |
+
try:
|
| 677 |
+
chunk_result = model(chunk)[0]
|
| 678 |
+
model_results.append(chunk_result)
|
| 679 |
+
except Exception as e:
|
| 680 |
+
print(f"Error analyzing chunk with model {model_name}: {str(e)}")
|
| 681 |
+
|
| 682 |
+
if model_results:
|
| 683 |
+
# Aggregate results from all chunks
|
| 684 |
+
categories = {}
|
| 685 |
+
for res in model_results:
|
| 686 |
+
# Ensure the label is lowercase for consistency
|
| 687 |
+
label = res['label'].lower() if isinstance(res.get('label'), str) else "unknown"
|
| 688 |
+
score = res['score']
|
| 689 |
+
if label in categories:
|
| 690 |
+
categories[label] += score
|
| 691 |
+
else:
|
| 692 |
+
categories[label] = score
|
| 693 |
+
|
| 694 |
+
# Normalize scores
|
| 695 |
+
total = sum(categories.values())
|
| 696 |
+
if total > 0:
|
| 697 |
+
categories = {k: round(v/total, 3) for k, v in categories.items()}
|
| 698 |
+
|
| 699 |
+
# Get dominant category
|
| 700 |
+
dominant_category = max(categories.items(), key=lambda x: x[1])
|
| 701 |
+
|
| 702 |
+
# Store results for this model
|
| 703 |
+
result["models"][model_name] = {
|
| 704 |
+
"category": dominant_category[0],
|
| 705 |
+
"confidence": dominant_category[1],
|
| 706 |
+
"distribution": categories
|
| 707 |
+
}
|
| 708 |
+
|
| 709 |
+
# Calculate sentiment indices based on the fine-grained results
|
| 710 |
+
result["indices"] = self._calculate_sentiment_indices(result)
|
| 711 |
+
|
| 712 |
+
return result
|
| 713 |
+
|
| 714 |
+
except Exception as e:
|
| 715 |
+
print(f"Error in fine-grained sentiment analysis: {str(e)}")
|
| 716 |
+
return result
|
| 717 |
+
|
| 718 |
+
def _calculate_sentiment_indices(self, fine_grained_results: Dict[str, Any]) -> Dict[str, float]:
|
| 719 |
+
"""Calculate various sentiment indices based on fine-grained sentiment analysis."""
|
| 720 |
+
indices = {
|
| 721 |
+
"positivity_index": 0.5, # Default neutral value
|
| 722 |
+
"negativity_index": 0.5,
|
| 723 |
+
"emotional_intensity": 0.0,
|
| 724 |
+
"controversy_score": 0.0,
|
| 725 |
+
"confidence_score": 0.0,
|
| 726 |
+
"esg_relevance": 0.0
|
| 727 |
+
}
|
| 728 |
+
|
| 729 |
+
try:
|
| 730 |
+
# Extract distributions from all models
|
| 731 |
+
distributions = {}
|
| 732 |
+
confidence_scores = {}
|
| 733 |
+
|
| 734 |
+
# Add primary model if available
|
| 735 |
+
if "category" in fine_grained_results.get("primary", {}):
|
| 736 |
+
if "distribution" in fine_grained_results["primary"]:
|
| 737 |
+
distributions["primary"] = fine_grained_results["primary"]["distribution"]
|
| 738 |
+
confidence_scores["primary"] = fine_grained_results["primary"].get("confidence", 0.0)
|
| 739 |
+
|
| 740 |
+
# Add other models
|
| 741 |
+
for model_name, model_result in fine_grained_results.get("models", {}).items():
|
| 742 |
+
if "distribution" in model_result:
|
| 743 |
+
distributions[model_name] = model_result["distribution"]
|
| 744 |
+
confidence_scores[model_name] = model_result.get("confidence", 0.0)
|
| 745 |
+
|
| 746 |
+
# Calculate positivity index
|
| 747 |
+
positive_scores = []
|
| 748 |
+
for model_name, dist in distributions.items():
|
| 749 |
+
if model_name == "financial" or model_name == "primary" or model_name == "news_tone" or model_name == "aspect":
|
| 750 |
+
pos_score = dist.get("positive", 0.0)
|
| 751 |
+
positive_scores.append(pos_score)
|
| 752 |
+
elif model_name == "emotion":
|
| 753 |
+
# For emotion model, consider joy as positive
|
| 754 |
+
pos_score = dist.get("joy", 0.0) + dist.get("surprise", 0.0) * 0.5
|
| 755 |
+
positive_scores.append(pos_score)
|
| 756 |
+
|
| 757 |
+
if positive_scores:
|
| 758 |
+
indices["positivity_index"] = round(sum(positive_scores) / len(positive_scores), 3)
|
| 759 |
+
|
| 760 |
+
# Calculate negativity index
|
| 761 |
+
negative_scores = []
|
| 762 |
+
for model_name, dist in distributions.items():
|
| 763 |
+
if model_name == "financial" or model_name == "primary" or model_name == "news_tone" or model_name == "aspect":
|
| 764 |
+
neg_score = dist.get("negative", 0.0)
|
| 765 |
+
negative_scores.append(neg_score)
|
| 766 |
+
elif model_name == "emotion":
|
| 767 |
+
# For emotion model, consider sadness, anger, fear, disgust as negative
|
| 768 |
+
neg_score = dist.get("sadness", 0.0) + dist.get("anger", 0.0) + \
|
| 769 |
+
dist.get("fear", 0.0) + dist.get("disgust", 0.0)
|
| 770 |
+
negative_scores.append(neg_score / 4) # Average of 4 negative emotions
|
| 771 |
+
|
| 772 |
+
if negative_scores:
|
| 773 |
+
indices["negativity_index"] = round(sum(negative_scores) / len(negative_scores), 3)
|
| 774 |
+
|
| 775 |
+
# Calculate emotional intensity
|
| 776 |
+
emotion_dist = distributions.get("emotion", {})
|
| 777 |
+
if emotion_dist:
|
| 778 |
+
# Sum all emotional intensities except neutral
|
| 779 |
+
emotional_sum = sum(v for k, v in emotion_dist.items() if k != "neutral")
|
| 780 |
+
indices["emotional_intensity"] = round(emotional_sum, 3)
|
| 781 |
+
|
| 782 |
+
# Calculate controversy score (high when both positive and negative are high)
|
| 783 |
+
indices["controversy_score"] = round(indices["positivity_index"] * indices["negativity_index"] * 4, 3)
|
| 784 |
+
|
| 785 |
+
# Calculate confidence score (average of all model confidences)
|
| 786 |
+
if confidence_scores:
|
| 787 |
+
indices["confidence_score"] = round(sum(confidence_scores.values()) / len(confidence_scores), 3)
|
| 788 |
+
|
| 789 |
+
# Calculate ESG relevance if available
|
| 790 |
+
esg_dist = distributions.get("esg", {})
|
| 791 |
+
if esg_dist:
|
| 792 |
+
# Sum of all ESG categories
|
| 793 |
+
esg_sum = sum(v for k, v in esg_dist.items() if k in ["environmental", "social", "governance"])
|
| 794 |
+
indices["esg_relevance"] = round(esg_sum, 3)
|
| 795 |
+
|
| 796 |
+
return indices
|
| 797 |
+
|
| 798 |
+
except Exception as e:
|
| 799 |
+
print(f"Error calculating sentiment indices: {str(e)}")
|
| 800 |
+
return indices
|
| 801 |
+
|
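# Worked sketch (not part of the module): the controversy score defined above is the
# product of positivity and negativity scaled by 4, so it peaks when both are high.
round(0.5 * 0.5 * 4, 3)   # 1.0  -> maximally mixed coverage
round(0.8 * 0.1 * 4, 3)   # 0.32 -> one-sided coverage; note the value is not clamped to 1.0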
| 802 |
+
def summarize_text(self, text: str) -> str:
|
| 803 |
+
"""Generate a concise summary of the text."""
|
| 804 |
+
try:
|
| 805 |
+
# Clean and prepare text
|
| 806 |
+
text = text.replace('\n', ' ').strip()
|
| 807 |
+
|
| 808 |
+
# Split text into chunks if it's too long
|
| 809 |
+
chunks = self._split_text(text)
|
| 810 |
+
|
| 811 |
+
summaries = []
|
| 812 |
+
for chunk in chunks:
|
| 813 |
+
# Generate summary for each chunk
|
| 814 |
+
summary = self.summarizer(chunk,
|
| 815 |
+
max_length=130,
|
| 816 |
+
min_length=30,
|
| 817 |
+
do_sample=False)[0]['summary_text']
|
| 818 |
+
summaries.append(summary)
|
| 819 |
+
|
| 820 |
+
# Combine summaries if there were multiple chunks
|
| 821 |
+
final_summary = ' '.join(summaries)
|
| 822 |
+
return final_summary
|
| 823 |
+
|
| 824 |
+
except Exception as e:
|
| 825 |
+
print(f"Error generating summary: {str(e)}")
|
| 826 |
+
return text[:200] + '...' # Return truncated text as fallback
|
| 827 |
+
|
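# Usage sketch (hypothetical text; assumes the summarization model can be loaded):
# analyzer = SentimentAnalyzer()
# analyzer.summarize_text(long_article_text)  # one 30-130 token summary per chunk, joined with spaces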
| 828 |
+
def extract_topics(self, text: str) -> List[str]:
|
| 829 |
+
"""Extract key topics from the text using TF-IDF."""
|
| 830 |
+
try:
|
| 831 |
+
# Prepare text
|
| 832 |
+
text = text.lower()
|
| 833 |
+
|
| 834 |
+
# Fit and transform the text
|
| 835 |
+
tfidf_matrix = self.vectorizer.fit_transform([text])
|
| 836 |
+
|
| 837 |
+
# Get feature names and scores
|
| 838 |
+
feature_names = self.vectorizer.get_feature_names_out()
|
| 839 |
+
scores = tfidf_matrix.toarray()[0]
|
| 840 |
+
|
| 841 |
+
# Get top topics
|
| 842 |
+
top_indices = scores.argsort()[-5:][::-1] # Get top 5 topics
|
| 843 |
+
topics = [feature_names[i] for i in top_indices]
|
| 844 |
+
|
| 845 |
+
return topics
|
| 846 |
+
|
| 847 |
+
except Exception as e:
|
| 848 |
+
print(f"Error extracting topics: {str(e)}")
|
| 849 |
+
return []
|
| 850 |
+
|
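# Minimal standalone sketch (not part of the module) of the same TF-IDF keyword idea,
# using a throwaway vectorizer so the shared self.vectorizer is untouched:
from sklearn.feature_extraction.text import TfidfVectorizer as _SketchVectorizer
_vec = _SketchVectorizer(stop_words='english', max_features=10)
_matrix = _vec.fit_transform(["chip maker raises guidance as data center demand grows"])
_names = _vec.get_feature_names_out()
_top_terms = [_names[i] for i in _matrix.toarray()[0].argsort()[-5:][::-1]]
# _top_terms holds five of: 'chip', 'maker', 'raises', 'guidance', 'data', 'center', 'demand', 'grows'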
| 851 |
+
def _split_text(self, text: str, max_length: int = 1024) -> List[str]:
|
| 852 |
+
"""Split text into chunks that fit within model's maximum token limit."""
|
| 853 |
+
words = text.split()
|
| 854 |
+
chunks = []
|
| 855 |
+
current_chunk = []
|
| 856 |
+
current_length = 0
|
| 857 |
+
|
| 858 |
+
for word in words:
|
| 859 |
+
word_length = len(word) + 1 # +1 for space
|
| 860 |
+
if current_length + word_length > max_length:
|
| 861 |
+
chunks.append(' '.join(current_chunk))
|
| 862 |
+
current_chunk = [word]
|
| 863 |
+
current_length = word_length
|
| 864 |
+
else:
|
| 865 |
+
current_chunk.append(word)
|
| 866 |
+
current_length += word_length
|
| 867 |
+
|
| 868 |
+
if current_chunk:
|
| 869 |
+
chunks.append(' '.join(current_chunk))
|
| 870 |
+
|
| 871 |
+
return chunks
|
| 872 |
+
|
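# Note (sketch, not part of the module): chunk size here is measured in characters,
# not model tokens, so max_length=1024 is only a rough proxy for a 512-token input cap.
# For example, 400 five-letter words (~2,400 characters) split into three chunks:
# SentimentAnalyzer()._split_text("lorem " * 400)  # -> 3 chunks of at most ~1,020 characters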
| 873 |
+
def _extract_entities(self, text: str) -> Dict[str, List[str]]:
|
| 874 |
+
"""Extract named entities from text."""
|
| 875 |
+
entities = {
|
| 876 |
+
'PERSON': [],
|
| 877 |
+
'ORG': [],
|
| 878 |
+
'GPE': [], # Countries, cities, states
|
| 879 |
+
'MONEY': [],
|
| 880 |
+
'PERCENT': [],
|
| 881 |
+
'DATE': []
|
| 882 |
+
}
|
| 883 |
+
|
| 884 |
+
if not self.has_ner:
|
| 885 |
+
return entities
|
| 886 |
+
|
| 887 |
+
try:
|
| 888 |
+
# Process text with spaCy
|
| 889 |
+
doc = self.nlp(text[:10000]) # Limit text length for performance
|
| 890 |
+
|
| 891 |
+
# Extract entities
|
| 892 |
+
for ent in doc.ents:
|
| 893 |
+
if ent.label_ in entities:
|
| 894 |
+
# Clean entity text and deduplicate
|
| 895 |
+
clean_text = ent.text.strip()
|
| 896 |
+
if clean_text and clean_text not in entities[ent.label_]:
|
| 897 |
+
entities[ent.label_].append(clean_text)
|
| 898 |
+
|
| 899 |
+
return entities
|
| 900 |
+
except Exception as e:
|
| 901 |
+
print(f"Error extracting entities: {str(e)}")
|
| 902 |
+
return entities
|
| 903 |
+
|
| 904 |
+
def _extract_sentiment_targets(self, text: str, entities: Dict[str, List[str]]) -> List[Dict[str, Any]]:
|
| 905 |
+
"""Extract entities that are targets of sentiment expressions."""
|
| 906 |
+
if not self.has_ner:
|
| 907 |
+
return []
|
| 908 |
+
|
| 909 |
+
try:
|
| 910 |
+
# Get all entities as a flat list
|
| 911 |
+
all_entities = []
|
| 912 |
+
for entity_type, entity_list in entities.items():
|
| 913 |
+
for entity in entity_list:
|
| 914 |
+
all_entities.append({
|
| 915 |
+
'text': entity,
|
| 916 |
+
'type': entity_type
|
| 917 |
+
})
|
| 918 |
+
|
| 919 |
+
# Find sentiment targets
|
| 920 |
+
targets = []
|
| 921 |
+
|
| 922 |
+
# Split text into sentences
|
| 923 |
+
doc = self.nlp(text[:10000]) # Limit text length
|
| 924 |
+
|
| 925 |
+
for sentence in doc.sents:
|
| 926 |
+
# Skip short sentences
|
| 927 |
+
if len(sentence.text.split()) < 3:
|
| 928 |
+
continue
|
| 929 |
+
|
| 930 |
+
# Check for sentiment in this sentence
|
| 931 |
+
try:
|
| 932 |
+
sentiment = self.sentiment_pipeline(sentence.text)[0]
|
| 933 |
+
# Only process if sentiment is strong
|
| 934 |
+
if sentiment['score'] > 0.7:
|
| 935 |
+
# Find entities in this sentence
|
| 936 |
+
for entity in all_entities:
|
| 937 |
+
if entity['text'] in sentence.text:
|
| 938 |
+
targets.append({
|
| 939 |
+
'entity': entity['text'],
|
| 940 |
+
'type': entity['type'],
|
| 941 |
+
'sentiment': sentiment['label'].lower(),
|
| 942 |
+
'confidence': round(sentiment['score'], 3),
|
| 943 |
+
'context': sentence.text
|
| 944 |
+
})
|
| 945 |
+
except Exception:
|
| 946 |
+
continue
|
| 947 |
+
|
| 948 |
+
# Return unique targets
|
| 949 |
+
unique_targets = []
|
| 950 |
+
seen = set()
|
| 951 |
+
for target in targets:
|
| 952 |
+
key = f"{target['entity']}_{target['sentiment']}"
|
| 953 |
+
if key not in seen:
|
| 954 |
+
seen.add(key)
|
| 955 |
+
unique_targets.append(target)
|
| 956 |
+
|
| 957 |
+
return unique_targets
|
| 958 |
+
|
| 959 |
+
except Exception as e:
|
| 960 |
+
print(f"Error extracting sentiment targets: {str(e)}")
|
| 961 |
+
return []
|
| 962 |
+
|
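# Minimal sketch (not part of the module) of the spaCy NER call used by
# _extract_entities, assuming `en_core_web_sm` has been downloaded:
# import spacy
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Apple raised guidance after Tim Cook said revenue in India grew 8%.")
# [(ent.text, ent.label_) for ent in doc.ents]
# typically yields entries such as ('Apple', 'ORG'), ('Tim Cook', 'PERSON'), ('India', 'GPE'), ('8%', 'PERCENT')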
| 963 |
+
class TextToSpeechConverter:
|
| 964 |
+
def __init__(self):
|
| 965 |
+
self.output_dir = AUDIO_OUTPUT_DIR
|
| 966 |
+
self.translator = Translator()
|
| 967 |
+
os.makedirs(self.output_dir, exist_ok=True)
|
| 968 |
+
|
| 969 |
+
def generate_audio(self, text: str, filename: str) -> str:
|
| 970 |
+
"""Convert text to Hindi speech and save as audio file."""
|
| 971 |
+
try:
|
| 972 |
+
print(f"Translating text to Hindi: {text[:100]}...")
|
| 973 |
+
|
| 974 |
+
# First translate the text to Hindi
|
| 975 |
+
# Use chunking for long text to avoid translation limits
|
| 976 |
+
chunks = []
|
| 977 |
+
for i in range(0, len(text), 1000):
|
| 978 |
+
chunk = text[i:i+1000]
|
| 979 |
+
try:
|
| 980 |
+
translated_chunk = self.translator.translate(chunk, dest='hi').text
|
| 981 |
+
chunks.append(translated_chunk)
|
| 982 |
+
print(f"Translated chunk {i//1000 + 1}")
|
| 983 |
+
except Exception as e:
|
| 984 |
+
print(f"Error translating chunk {i//1000 + 1}: {str(e)}")
|
| 985 |
+
# If translation fails, use original text
|
| 986 |
+
chunks.append(chunk)
|
| 987 |
+
|
| 988 |
+
hindi_text = ' '.join(chunks)
|
| 989 |
+
print(f"Translation complete. Hindi text length: {len(hindi_text)}")
|
| 990 |
+
|
| 991 |
+
# Generate Hindi speech
|
| 992 |
+
print("Generating Hindi speech...")
|
| 993 |
+
tts = gTTS(text=hindi_text, lang='hi', slow=False)
|
| 994 |
+
output_path = os.path.join(self.output_dir, f"{filename}.mp3")
|
| 995 |
+
tts.save(output_path)
|
| 996 |
+
print(f"Audio saved to {output_path}")
|
| 997 |
+
|
| 998 |
+
return output_path
|
| 999 |
+
except Exception as e:
|
| 1000 |
+
print(f"Error in TTS conversion: {str(e)}")
|
| 1001 |
+
# Fallback to original text if translation fails
|
| 1002 |
+
print("Using fallback English TTS")
|
| 1003 |
+
tts = gTTS(text=text, lang='en')
|
| 1004 |
+
output_path = os.path.join(self.output_dir, f"{filename}.mp3")
|
| 1005 |
+
tts.save(output_path)
|
| 1006 |
+
return output_path
|
| 1007 |
+
|
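# Usage sketch (hypothetical summary text; googletrans and gTTS both need network access):
# tts = TextToSpeechConverter()
# audio_path = tts.generate_audio("Company X reported strong quarterly results.", "company_x_summary")
# audio_path -> os.path.join(AUDIO_OUTPUT_DIR, "company_x_summary.mp3")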
| 1008 |
+
class ComparativeAnalyzer:
|
| 1009 |
+
def __init__(self):
|
| 1010 |
+
pass
|
| 1011 |
+
|
| 1012 |
+
def analyze_coverage(self, articles: List[Dict[str, Any]], company_name: str = None) -> Dict[str, Any]:
|
| 1013 |
+
"""Perform comparative analysis across articles."""
|
| 1014 |
+
if not articles:
|
| 1015 |
+
return {
|
| 1016 |
+
"topics": [],
|
| 1017 |
+
"sentiment_distribution": {},
|
| 1018 |
+
"coverage_differences": ["No articles found for analysis."],
|
| 1019 |
+
"final_sentiment": "No articles found for analysis.",
|
| 1020 |
+
"total_articles": 0,
|
| 1021 |
+
"sentiment_indices": {}
|
| 1022 |
+
}
|
| 1023 |
+
|
| 1024 |
+
# Debug: Print articles for analysis
|
| 1025 |
+
print(f"Analyzing {len(articles)} articles for company: {company_name}")
|
| 1026 |
+
|
| 1027 |
+
# Add company name to each article if provided
|
| 1028 |
+
if company_name:
|
| 1029 |
+
for article in articles:
|
| 1030 |
+
article['company'] = company_name
|
| 1031 |
+
|
| 1032 |
+
# Calculate sentiment distribution
|
| 1033 |
+
print("Calculating sentiment distribution...")
|
| 1034 |
+
sentiment_dist = self._get_sentiment_distribution(articles)
|
| 1035 |
+
print("Sentiment distribution result:")
|
| 1036 |
+
print(sentiment_dist)
|
| 1037 |
+
|
| 1038 |
+
# Analyze common topics
|
| 1039 |
+
topics = self._analyze_topics(articles)
|
| 1040 |
+
|
| 1041 |
+
# Analyze coverage differences
|
| 1042 |
+
differences = self._analyze_coverage_differences(articles)
|
| 1043 |
+
|
| 1044 |
+
# Get final sentiment analysis
|
| 1045 |
+
final_sentiment = self._get_final_sentiment(sentiment_dist, articles)
|
| 1046 |
+
|
| 1047 |
+
result = {
|
| 1048 |
+
"topics": topics,
|
| 1049 |
+
"sentiment_distribution": sentiment_dist,
|
| 1050 |
+
"coverage_differences": differences,
|
| 1051 |
+
"final_sentiment": final_sentiment,
|
| 1052 |
+
"total_articles": len(articles),
|
| 1053 |
+
"sentiment_indices": sentiment_dist.get("sentiment_indices", {})
|
| 1054 |
+
}
|
| 1055 |
+
|
| 1056 |
+
# Debug: Print final result
|
| 1057 |
+
print("Final comparative analysis result:")
|
| 1058 |
+
print(result)
|
| 1059 |
+
|
| 1060 |
+
return result
|
| 1061 |
+
|
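# Usage sketch (articles are hypothetical and assumed to have been run through
# SentimentAnalyzer.analyze_article first):
# comparer = ComparativeAnalyzer()
# report = comparer.analyze_coverage(analyzed_articles, company_name="Acme")
# report['sentiment_distribution']['basic']   # e.g. {'positive': 6, 'negative': 2, 'neutral': 2}
# report['final_sentiment']                   # one-paragraph narrative summary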
| 1062 |
+
def _get_sentiment_distribution(self, articles: List[Dict[str, Any]]) -> Dict[str, Any]:
|
| 1063 |
+
"""Calculate distribution of sentiments across articles."""
|
| 1064 |
+
# Basic sentiment distribution
|
| 1065 |
+
basic_distribution = {'positive': 0, 'negative': 0, 'neutral': 0}
|
| 1066 |
+
|
| 1067 |
+
# Fine-grained sentiment distribution
|
| 1068 |
+
fine_grained_distribution = {}
|
| 1069 |
+
|
| 1070 |
+
# Sentiment scores
|
| 1071 |
+
sentiment_scores = []
|
| 1072 |
+
|
| 1073 |
+
# Sentiment indices aggregation
|
| 1074 |
+
sentiment_indices = {
|
| 1075 |
+
"positivity_index": [],
|
| 1076 |
+
"negativity_index": [],
|
| 1077 |
+
"emotional_intensity": [],
|
| 1078 |
+
"controversy_score": [],
|
| 1079 |
+
"confidence_score": [],
|
| 1080 |
+
"esg_relevance": []
|
| 1081 |
+
}
|
| 1082 |
+
|
| 1083 |
+
# Debug: Print articles for sentiment distribution
|
| 1084 |
+
print(f"Processing {len(articles)} articles for sentiment distribution")
|
| 1085 |
+
|
| 1086 |
+
# Process each article
|
| 1087 |
+
for i, article in enumerate(articles):
|
| 1088 |
+
try:
|
| 1089 |
+
# Debug: Print article sentiment data
|
| 1090 |
+
print(f"Article {i+1} sentiment data:")
|
| 1091 |
+
print(f" Basic sentiment: {article.get('sentiment', 'N/A')}")
|
| 1092 |
+
print(f" Fine-grained: {article.get('fine_grained_sentiment', {})}")
|
| 1093 |
+
print(f" Sentiment indices: {article.get('sentiment_indices', {})}")
|
| 1094 |
+
|
| 1095 |
+
# Basic sentiment
|
| 1096 |
+
sentiment = article.get('sentiment', 'neutral')
|
| 1097 |
+
if isinstance(sentiment, str):
|
| 1098 |
+
sentiment = sentiment.lower()
|
| 1099 |
+
# Ensure we have a valid sentiment category
|
| 1100 |
+
if sentiment not in basic_distribution:
|
| 1101 |
+
sentiment = 'neutral'
|
| 1102 |
+
basic_distribution[sentiment] = basic_distribution.get(sentiment, 0) + 1
|
| 1103 |
+
else:
|
| 1104 |
+
# Handle non-string sentiment values
|
| 1105 |
+
basic_distribution['neutral'] = basic_distribution.get('neutral', 0) + 1
|
| 1106 |
+
|
| 1107 |
+
# Sentiment score
|
| 1108 |
+
score = article.get('sentiment_score', 0.0)
|
| 1109 |
+
if isinstance(score, (int, float)):
|
| 1110 |
+
sentiment_scores.append(score)
|
| 1111 |
+
|
| 1112 |
+
# Fine-grained sentiment
|
| 1113 |
+
fine_grained = article.get('fine_grained_sentiment', {})
|
| 1114 |
+
if isinstance(fine_grained, dict) and 'category' in fine_grained:
|
| 1115 |
+
category = fine_grained['category']
|
| 1116 |
+
if isinstance(category, str):
|
| 1117 |
+
category = category.lower()
|
| 1118 |
+
fine_grained_distribution[category] = fine_grained_distribution.get(category, 0) + 1
|
| 1119 |
+
|
| 1120 |
+
# Collect sentiment indices
|
| 1121 |
+
indices = article.get('sentiment_indices', {})
|
| 1122 |
+
if isinstance(indices, dict):
|
| 1123 |
+
for index_name, index_values in sentiment_indices.items():
|
| 1124 |
+
if index_name in indices and isinstance(indices[index_name], (int, float)):
|
| 1125 |
+
index_values.append(indices[index_name])
|
| 1126 |
+
except Exception as e:
|
| 1127 |
+
print(f"Error processing article {i+1} for sentiment distribution: {str(e)}")
|
| 1128 |
+
# Continue with next article
|
| 1129 |
+
continue
|
| 1130 |
+
|
| 1131 |
+
# Debug: Print collected data
|
| 1132 |
+
print("Collected sentiment data:")
|
| 1133 |
+
print(f" Basic distribution: {basic_distribution}")
|
| 1134 |
+
print(f" Fine-grained distribution: {fine_grained_distribution}")
|
| 1135 |
+
print(f" Sentiment scores: {sentiment_scores}")
|
| 1136 |
+
print(f" Sentiment indices collected: {sentiment_indices}")
|
| 1137 |
+
|
| 1138 |
+
# Calculate average sentiment score with fallback
|
| 1139 |
+
avg_sentiment_score = 0.5 # Default neutral value
|
| 1140 |
+
if sentiment_scores:
|
| 1141 |
+
avg_sentiment_score = sum(sentiment_scores) / len(sentiment_scores)
|
| 1142 |
+
|
| 1143 |
+
# Calculate sentiment volatility (standard deviation) with fallback
|
| 1144 |
+
sentiment_volatility = 0
|
| 1145 |
+
if len(sentiment_scores) > 1:
|
| 1146 |
+
try:
|
| 1147 |
+
sentiment_volatility = statistics.stdev(sentiment_scores)
|
| 1148 |
+
except Exception as e:
|
| 1149 |
+
print(f"Error calculating sentiment volatility: {str(e)}")
|
| 1150 |
+
|
| 1151 |
+
# Calculate average sentiment indices with fallbacks
|
| 1152 |
+
avg_indices = {}
|
| 1153 |
+
for index_name, values in sentiment_indices.items():
|
| 1154 |
+
if values:
|
| 1155 |
+
avg_indices[index_name] = round(sum(values) / len(values), 3)
|
| 1156 |
+
else:
|
| 1157 |
+
# Provide default values for empty indices
|
| 1158 |
+
if index_name in ["positivity_index", "confidence_score"]:
|
| 1159 |
+
avg_indices[index_name] = 0.5 # Neutral default
|
| 1160 |
+
else:
|
| 1161 |
+
avg_indices[index_name] = 0.0 # Zero default for other indices
|
| 1162 |
+
|
| 1163 |
+
# Ensure all expected indices exist
|
| 1164 |
+
for index_name in ["positivity_index", "negativity_index", "emotional_intensity",
|
| 1165 |
+
"controversy_score", "confidence_score", "esg_relevance"]:
|
| 1166 |
+
if index_name not in avg_indices:
|
| 1167 |
+
avg_indices[index_name] = 0.5 if index_name in ["positivity_index", "confidence_score"] else 0.0
|
| 1168 |
+
|
| 1169 |
+
# Ensure we have at least one item in each distribution
|
| 1170 |
+
if not any(basic_distribution.values()):
|
| 1171 |
+
basic_distribution['neutral'] = 1
|
| 1172 |
+
|
| 1173 |
+
# Ensure fine_grained_distribution has at least one entry if empty
|
| 1174 |
+
if not fine_grained_distribution:
|
| 1175 |
+
fine_grained_distribution['neutral'] = 1
|
| 1176 |
+
|
| 1177 |
+
result = {
|
| 1178 |
+
"basic": basic_distribution,
|
| 1179 |
+
"fine_grained": fine_grained_distribution,
|
| 1180 |
+
"avg_score": round(avg_sentiment_score, 3),
|
| 1181 |
+
"volatility": round(sentiment_volatility, 3),
|
| 1182 |
+
"sentiment_indices": avg_indices
|
| 1183 |
+
}
|
| 1184 |
+
|
| 1185 |
+
# Debug: Print final sentiment distribution result
|
| 1186 |
+
print("Final sentiment distribution result:")
|
| 1187 |
+
print(result)
|
| 1188 |
+
|
| 1189 |
+
return result
|
| 1190 |
+
|
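# Worked sketch (not part of the module): volatility above is simply the standard
# deviation of per-article sentiment scores.
import statistics as _stats_sketch
_stats_sketch.stdev([0.9, 0.2, 0.6])   # ~0.351, which _get_final_sentiment reads as "varies considerably" (> 0.2)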
| 1191 |
+
def _analyze_topics(self, articles: List[Dict[str, Any]]) -> List[str]:
|
| 1192 |
+
"""Analyze common topics across articles using TF-IDF."""
|
| 1193 |
+
try:
|
| 1194 |
+
# Combine title and content for better topic extraction
|
| 1195 |
+
texts = [f"{article.get('title', '')} {article.get('content', '')}" for article in articles]
|
| 1196 |
+
|
| 1197 |
+
# Create and fit TF-IDF
|
| 1198 |
+
vectorizer = TfidfVectorizer(
|
| 1199 |
+
max_features=10,
|
| 1200 |
+
stop_words='english',
|
| 1201 |
+
ngram_range=(1, 2),
|
| 1202 |
+
token_pattern=r'(?u)\b[A-Za-z][A-Za-z+\'-]*[A-Za-z]+\b' # Improved pattern
|
| 1203 |
+
)
|
| 1204 |
+
|
| 1205 |
+
# Clean and normalize texts
|
| 1206 |
+
cleaned_texts = []
|
| 1207 |
+
for text in texts:
|
| 1208 |
+
# Remove numbers and special characters
|
| 1209 |
+
cleaned = re.sub(r'\d+', '', text)
|
| 1210 |
+
cleaned = re.sub(r'[^\w\s]', ' ', cleaned)
|
| 1211 |
+
cleaned_texts.append(cleaned.lower())
|
| 1212 |
+
|
| 1213 |
+
tfidf_matrix = vectorizer.fit_transform(cleaned_texts)
|
| 1214 |
+
feature_names = vectorizer.get_feature_names_out()
|
| 1215 |
+
|
| 1216 |
+
# Get average TF-IDF scores for each term
|
| 1217 |
+
avg_scores = tfidf_matrix.mean(axis=0).A1
|
| 1218 |
+
|
| 1219 |
+
# Sort terms by score and return top meaningful terms
|
| 1220 |
+
sorted_indices = avg_scores.argsort()[-5:][::-1]
|
| 1221 |
+
meaningful_topics = []
|
| 1222 |
+
|
| 1223 |
+
for idx in sorted_indices:
|
| 1224 |
+
topic = feature_names[idx]
|
| 1225 |
+
# Filter out single characters and common words
|
| 1226 |
+
if len(topic) > 1 and topic not in {'000', 'com', 'said', 'says', 'year', 'new', 'one'}:
|
| 1227 |
+
meaningful_topics.append(topic)
|
| 1228 |
+
if len(meaningful_topics) >= 5:
|
| 1229 |
+
break
|
| 1230 |
+
|
| 1231 |
+
return meaningful_topics
|
| 1232 |
+
|
| 1233 |
+
except Exception as e:
|
| 1234 |
+
print(f"Error analyzing topics: {str(e)}")
|
| 1235 |
+
return []
|
| 1236 |
+
|
| 1237 |
+
def _analyze_coverage_differences(self, articles: List[Dict[str, Any]]) -> List[str]:
|
| 1238 |
+
"""Analyze how coverage differs across articles."""
|
| 1239 |
+
if not articles:
|
| 1240 |
+
return ["No articles available for comparison"]
|
| 1241 |
+
|
| 1242 |
+
differences = []
|
| 1243 |
+
|
| 1244 |
+
# Compare sentiment differences
|
| 1245 |
+
sentiments = [article.get('sentiment', 'neutral').lower() for article in articles]
|
| 1246 |
+
unique_sentiments = set(sentiments)
|
| 1247 |
+
if len(unique_sentiments) > 1:
|
| 1248 |
+
pos_count = sentiments.count('positive')
|
| 1249 |
+
neg_count = sentiments.count('negative')
|
| 1250 |
+
neu_count = sentiments.count('neutral')
|
| 1251 |
+
|
| 1252 |
+
if pos_count > 0 and neg_count > 0:
|
| 1253 |
+
differences.append(f"Coverage sentiment varies significantly: {pos_count} positive, {neg_count} negative, and {neu_count} neutral articles.")
|
| 1254 |
+
|
| 1255 |
+
# Compare fine-grained sentiment differences
|
| 1256 |
+
fine_grained_categories = []
|
| 1257 |
+
for article in articles:
|
| 1258 |
+
fine_grained = article.get('fine_grained_sentiment', {})
|
| 1259 |
+
if isinstance(fine_grained, dict) and 'category' in fine_grained:
|
| 1260 |
+
category = fine_grained['category']
|
| 1261 |
+
if isinstance(category, str):
|
| 1262 |
+
fine_grained_categories.append(category.lower())
|
| 1263 |
+
|
| 1264 |
+
unique_categories = set(fine_grained_categories)
|
| 1265 |
+
if len(unique_categories) > 2: # More than 2 different categories
|
| 1266 |
+
category_counts = {}
|
| 1267 |
+
for category in fine_grained_categories:
|
| 1268 |
+
category_counts[category] = category_counts.get(category, 0) + 1
|
| 1269 |
+
|
| 1270 |
+
top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:3]
|
| 1271 |
+
categories_str = ", ".join([f"{cat} ({count})" for cat, count in top_categories])
|
| 1272 |
+
differences.append(f"Articles show diverse sentiment categories: {categories_str}")
|
| 1273 |
+
|
| 1274 |
+
# Compare sentiment indices
|
| 1275 |
+
indices_differences = []
|
| 1276 |
+
positivity_values = []
|
| 1277 |
+
negativity_values = []
|
| 1278 |
+
controversy_values = []
|
| 1279 |
+
|
| 1280 |
+
for article in articles:
|
| 1281 |
+
indices = article.get('sentiment_indices', {})
|
| 1282 |
+
if indices:
|
| 1283 |
+
if 'positivity_index' in indices:
|
| 1284 |
+
positivity_values.append(indices['positivity_index'])
|
| 1285 |
+
if 'negativity_index' in indices:
|
| 1286 |
+
negativity_values.append(indices['negativity_index'])
|
| 1287 |
+
if 'controversy_score' in indices:
|
| 1288 |
+
controversy_values.append(indices['controversy_score'])
|
| 1289 |
+
|
| 1290 |
+
# Check for high variance in positivity
|
| 1291 |
+
if positivity_values and len(positivity_values) > 1:
|
| 1292 |
+
if max(positivity_values) - min(positivity_values) > 0.4:
|
| 1293 |
+
indices_differences.append("Articles show significant variation in positivity levels")
|
| 1294 |
+
|
| 1295 |
+
# Check for high variance in negativity
|
| 1296 |
+
if negativity_values and len(negativity_values) > 1:
|
| 1297 |
+
if max(negativity_values) - min(negativity_values) > 0.4:
|
| 1298 |
+
indices_differences.append("Articles show significant variation in negativity levels")
|
| 1299 |
+
|
| 1300 |
+
# Check for high controversy scores
|
| 1301 |
+
if controversy_values:
|
| 1302 |
+
high_controversy = [v for v in controversy_values if v > 0.5]
|
| 1303 |
+
if high_controversy:
|
| 1304 |
+
indices_differences.append(f"{len(high_controversy)} articles show high controversy scores")
|
| 1305 |
+
|
| 1306 |
+
if indices_differences:
|
| 1307 |
+
differences.append("Sentiment index analysis: " + "; ".join(indices_differences))
|
| 1308 |
+
|
| 1309 |
+
# Compare source differences
|
| 1310 |
+
sources = [article.get('source', '').lower() for article in articles]
|
| 1311 |
+
source_counts = {}
|
| 1312 |
+
for source in sources:
|
| 1313 |
+
if source:
|
| 1314 |
+
source_counts[source] = source_counts.get(source, 0) + 1
|
| 1315 |
+
|
| 1316 |
+
if len(source_counts) > 1:
|
| 1317 |
+
top_sources = sorted(source_counts.items(), key=lambda x: x[1], reverse=True)[:3]
|
| 1318 |
+
sources_str = ", ".join([f"{source} ({count})" for source, count in top_sources])
|
| 1319 |
+
differences.append(f"Coverage spans multiple sources: {sources_str}")
|
| 1320 |
+
|
| 1321 |
+
# If no significant differences found
|
| 1322 |
+
if not differences:
|
| 1323 |
+
differences.append("Coverage is relatively consistent across articles")
|
| 1324 |
+
|
| 1325 |
+
return differences
|
| 1326 |
+
|
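# Worked sketch (not part of the module): the "significant variation" flag above fires
# when the spread of positivity indices across articles exceeds 0.4.
positivity_values_sketch = [0.15, 0.35, 0.62]
max(positivity_values_sketch) - min(positivity_values_sketch) > 0.4   # True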
| 1327 |
+
def _get_final_sentiment(self, distribution: Dict[str, Any], articles: List[Dict[str, Any]]) -> str:
|
| 1328 |
+
"""Generate final sentiment analysis based on distribution and article content."""
|
| 1329 |
+
try:
|
| 1330 |
+
# Get basic sentiment counts
|
| 1331 |
+
basic_dist = distribution.get('basic', {})
|
| 1332 |
+
positive_count = basic_dist.get('positive', 0)
|
| 1333 |
+
negative_count = basic_dist.get('negative', 0)
|
| 1334 |
+
neutral_count = basic_dist.get('neutral', 0)
|
| 1335 |
+
|
| 1336 |
+
total_articles = positive_count + negative_count + neutral_count
|
| 1337 |
+
|
| 1338 |
+
if total_articles == 0:
|
| 1339 |
+
return "No sentiment data available"
|
| 1340 |
+
|
| 1341 |
+
# Calculate percentages
|
| 1342 |
+
positive_pct = (positive_count / total_articles) * 100
|
| 1343 |
+
negative_pct = (negative_count / total_articles) * 100
|
| 1344 |
+
neutral_pct = (neutral_count / total_articles) * 100
|
| 1345 |
+
|
| 1346 |
+
# Get average sentiment score
|
| 1347 |
+
avg_score = distribution.get('avg_score', 0.5)
|
| 1348 |
+
|
| 1349 |
+
# Get volatility
|
| 1350 |
+
volatility = distribution.get('volatility', 0)
|
| 1351 |
+
|
| 1352 |
+
# Get sentiment indices
|
| 1353 |
+
indices = distribution.get('sentiment_indices', {})
|
| 1354 |
+
positivity_index = indices.get('positivity_index', 0.5)
|
| 1355 |
+
negativity_index = indices.get('negativity_index', 0.5)
|
| 1356 |
+
emotional_intensity = indices.get('emotional_intensity', 0)
|
| 1357 |
+
controversy_score = indices.get('controversy_score', 0)
|
| 1358 |
+
esg_relevance = indices.get('esg_relevance', 0)
|
| 1359 |
+
|
| 1360 |
+
# Generate analysis text
|
| 1361 |
+
analysis = []
|
| 1362 |
+
|
| 1363 |
+
# Overall sentiment
|
| 1364 |
+
if positive_pct > 60:
|
| 1365 |
+
analysis.append(f"Overall sentiment is predominantly positive ({positive_pct:.1f}%).")
|
| 1366 |
+
elif negative_pct > 60:
|
| 1367 |
+
analysis.append(f"Overall sentiment is predominantly negative ({negative_pct:.1f}%).")
|
| 1368 |
+
elif neutral_pct > 60:
|
| 1369 |
+
analysis.append(f"Overall sentiment is predominantly neutral ({neutral_pct:.1f}%).")
|
| 1370 |
+
elif positive_pct > negative_pct and positive_pct > neutral_pct:
|
| 1371 |
+
analysis.append(f"Overall sentiment leans positive ({positive_pct:.1f}%), with some mixed coverage.")
|
| 1372 |
+
elif negative_pct > positive_pct and negative_pct > neutral_pct:
|
| 1373 |
+
analysis.append(f"Overall sentiment leans negative ({negative_pct:.1f}%), with some mixed coverage.")
|
| 1374 |
+
else:
|
| 1375 |
+
analysis.append(f"Sentiment is mixed across sources (Positive: {positive_pct:.1f}%, Negative: {negative_pct:.1f}%, Neutral: {neutral_pct:.1f}%).")
|
| 1376 |
+
|
| 1377 |
+
# Sentiment indices insights
|
| 1378 |
+
if positivity_index > 0.7:
|
| 1379 |
+
analysis.append(f"High positivity index ({positivity_index:.2f}) indicates strong positive sentiment.")
|
| 1380 |
+
elif positivity_index < 0.3 and negativity_index > 0.7:
|
| 1381 |
+
analysis.append(f"High negativity index ({negativity_index:.2f}) with low positivity suggests strongly negative coverage.")
|
| 1382 |
+
|
| 1383 |
+
if emotional_intensity > 0.6:
|
| 1384 |
+
analysis.append(f"Coverage shows high emotional intensity ({emotional_intensity:.2f}).")
|
| 1385 |
+
|
| 1386 |
+
if controversy_score > 0.5:
|
| 1387 |
+
analysis.append(f"Coverage shows significant controversy ({controversy_score:.2f}), with polarized opinions.")
|
| 1388 |
+
|
| 1389 |
+
if esg_relevance > 0.4:
|
| 1390 |
+
analysis.append(f"Coverage includes significant ESG-related content ({esg_relevance:.2f}).")
|
| 1391 |
+
|
| 1392 |
+
# Volatility
|
| 1393 |
+
if volatility > 0.2:
|
| 1394 |
+
analysis.append(f"Sentiment varies considerably across articles (volatility: {volatility:.2f}).")
|
| 1395 |
+
else:
|
| 1396 |
+
analysis.append(f"Sentiment is relatively consistent across articles (volatility: {volatility:.2f}).")
|
| 1397 |
+
|
| 1398 |
+
return " ".join(analysis)
|
| 1399 |
+
|
| 1400 |
+
except Exception as e:
|
| 1401 |
+
print(f"Error generating final sentiment: {str(e)}")
|
| 1402 |
+
return "Unable to generate final sentiment analysis due to an error."
|
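# End-to-end sketch (not part of the module): wiring the classes in this file together.
# Model downloads and network access are required; the article below is a stub because
# NewsExtractor's public fetch method is defined in an earlier section of this file.
if __name__ == "__main__":
    _analyzer = SentimentAnalyzer()
    _comparer = ComparativeAnalyzer()
    _articles = [{'title': 'Acme posts record profit',
                  'content': 'Shares rose after the quarterly report beat estimates.',
                  'url': 'https://example.com/acme', 'source': 'Example'}]
    _analyzed = [_analyzer.analyze_article(a) for a in _articles]
    _report = _comparer.analyze_coverage(_analyzed, company_name='Acme')
    print(_report['final_sentiment'])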