RickyGuoTheCrazish committed · Commit 33c14bd · Parent: 14eed51
update finbert_market_evaluation

Files changed:
- README.md +45 -4
- requirements.txt +23 -3
- run_app.py +37 -0
- src/__init__.py +1 -0
- src/evaluation.py +294 -0
- src/market_data.py +297 -0
- src/sentiment_analyzer.py +120 -0
- src/streamlit_app.py +369 -35
- src/visualizations.py +302 -0
README.md CHANGED
@@ -7,14 +7,55 @@ sdk: docker
 app_port: 8501
 tags:
 - streamlit
+- finbert
+- sentiment-analysis
+- finance
+- machine-learning
 pinned: false
 short_description: Evaluate FinBERT’s sentiment predictions against market data
 license: mit
 ---
 
-#
-
-
-
+# 🚀 FinBERT Market Evaluation
+
+Evaluate how well FinBERT's financial sentiment predictions match actual stock market movements.
+
+## What It Does
+
+Enter financial news → get the FinBERT sentiment → compare it with the actual stock price movement → see whether the prediction was right.
+
+## How to Use
+
+1. **Paste financial news** (e.g., "Apple reports record earnings")
+2. **Enter stock ticker** (e.g., AAPL)
+3. **Select news date** (when the news was published)
+4. **Get results** - see whether sentiment matched the price movement
+
+## Key Features
+
+- **Smart thresholds** - uses each stock's own volatility (no rigid ±1% rules)
+- **Same-day + 24h analysis** - immediate reaction plus follow-through
+- **Graded scoring** - not just right/wrong, but how right (a 0-1 score)
+- **Market context** - compares the stock against overall market performance
+
+## Example
+
+**News**: "Tesla announces new factory in Germany"
+- **FinBERT says**: Positive sentiment (85% confidence)
+- **Stock moved**: +4.2% same day
+- **Evaluation**: ✅ Aligned (sentiment matched direction)
+- **Score**: 0.91/1.0 (excellent alignment)
+
+## Installation
+
+```bash
+pip install -r requirements.txt
+streamlit run src/streamlit_app.py
+```
+
+## Limitations
+
+- Research tool only (not for trading)
+- 30-second rate limit between requests
+- News must be at least 1 day old (market data is required)
+- Uses Yahoo Finance (free but limited)
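The "smart thresholds" feature described in the README reduces to one rule, implemented in src/evaluation.py below: a move only confirms (or contradicts) the sentiment if it clears the stock's own 14-day volatility. A minimal sketch of that rule, with k = 1.0 as in the commit and an illustrative volatility figure:

```python
# Minimal sketch of the volatility-aware alignment rule from src/evaluation.py
# (k = 1.0 by default; the 2.5% volatility below is an illustrative value).

def is_aligned(sentiment_direction: int, price_return: float,
               volatility_14d: float, k: float = 1.0) -> bool:
    """sentiment_direction: 1 = positive, -1 = negative, 0 = neutral."""
    threshold = k * volatility_14d          # per-stock threshold, not a rigid ±1%
    if sentiment_direction == 0:            # neutral: price should stay inside the band
        return abs(price_return) <= threshold
    if sentiment_direction == 1:            # positive: up-move must clear the band
        return price_return > threshold
    return price_return < -threshold        # negative: down-move must clear the band

# README example: positive Tesla news followed by a +4.2% same-day move
print(is_aligned(1, 4.2, 2.5))  # True → "✅ Aligned"
```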
requirements.txt CHANGED
@@ -1,3 +1,23 @@
-
-
-
+# Core Streamlit and data processing
+streamlit>=1.28.0
+pandas>=1.5.0
+numpy>=1.24.0
+altair>=4.2.0
+
+# Machine Learning and NLP
+transformers>=4.30.0
+torch>=2.0.0
+tokenizers>=0.13.0
+
+# Financial data
+yfinance>=0.2.18
+
+# Visualization and UI
+plotly>=5.15.0
+matplotlib>=3.7.0
+seaborn>=0.12.0
+
+# Utilities
+requests>=2.31.0
+python-dateutil>=2.8.0
+pytz>=2023.3
run_app.py ADDED
@@ -0,0 +1,37 @@

#!/usr/bin/env python3
"""
Simple launcher script for the FinBERT Market Evaluation Streamlit app.
"""

import subprocess
import sys
import os

def main():
    """Launch the Streamlit application."""
    print("🚀 Starting FinBERT Market Evaluation...")
    print("=" * 50)

    # Change to the correct directory
    app_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(app_dir)

    # Launch Streamlit
    try:
        cmd = [sys.executable, "-m", "streamlit", "run", "src/streamlit_app.py"]
        print(f"Running: {' '.join(cmd)}")
        print("=" * 50)

        subprocess.run(cmd, check=True)

    except subprocess.CalledProcessError as e:
        print(f"❌ Error launching Streamlit: {e}")
        return 1
    except KeyboardInterrupt:
        print("\n👋 Application stopped by user")
        return 0

    return 0

if __name__ == "__main__":
    sys.exit(main())
src/__init__.py ADDED
@@ -0,0 +1 @@
# FinBERT Market Evaluation Package
src/evaluation.py ADDED
@@ -0,0 +1,294 @@

# Core evaluation engine with heuristic algorithms
"""
This module implements the core evaluation logic including DAS calculation,
volatility-aware thresholds, WAT scoring, and macro-adjusted evaluation metrics.
"""

import numpy as np
from typing import Dict, List, Optional
import logging
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EvaluationEngine:
    """
    Core engine for evaluating FinBERT predictions against market movements.
    """

    def __init__(self, volatility_multiplier: float = 1.0, confidence_threshold: float = 0.7):
        """
        Initialize the evaluation engine.

        Args:
            volatility_multiplier: k factor for volatility thresholds (default: 1.0);
                threshold = k * 14-day volatility
            confidence_threshold: Minimum confidence for high-confidence predictions (default: 0.7)
        """
        self.volatility_multiplier = volatility_multiplier  # k = 1.0 per framework
        self.confidence_threshold = confidence_threshold

    def calculate_das(self, sentiment_direction: int, price_return: float,
                      volatility: float) -> float:
        """
        Calculate Directional Alignment Score (DAS).

        Args:
            sentiment_direction: 1 for positive, -1 for negative, 0 for neutral
            price_return: Stock return percentage
            volatility: Stock volatility percentage

        Returns:
            DAS score between 0 and 1
        """
        try:
            # Handle neutral sentiment
            if sentiment_direction == 0:
                # For neutral sentiment, score based on how close to zero the return is
                threshold = volatility * self.volatility_multiplier
                if abs(price_return) <= threshold:
                    return 1.0  # Perfect neutral prediction
                else:
                    # Decay score based on how far from neutral
                    excess = abs(price_return) - threshold
                    return max(0.0, 1.0 - (excess / (threshold * 2)))

            # For positive/negative sentiment
            expected_direction = sentiment_direction
            actual_direction = 1 if price_return > 0 else -1 if price_return < 0 else 0

            # Base alignment check
            if expected_direction == actual_direction:
                # Correct direction - score based on magnitude
                magnitude_factor = min(abs(price_return) / (volatility * self.volatility_multiplier), 2.0)
                return min(1.0, 0.7 + 0.3 * magnitude_factor)
            else:
                # Wrong direction - score based on how wrong
                threshold = volatility * self.volatility_multiplier
                if abs(price_return) <= threshold:
                    # Small move in wrong direction - partial credit
                    return 0.3
                else:
                    # Large move in wrong direction - low score
                    return max(0.0, 0.3 - (abs(price_return) - threshold) / (threshold * 3))

        except Exception as e:
            logger.error(f"Error calculating DAS: {str(e)}")
            return 0.0

    def calculate_wat_weight(self, confidence: float, impact: float,
                             days_ago: int = 0, decay_factor: float = 0.95) -> float:
        """
        Calculate Weighted Accuracy over Time (WAT) weight.

        Args:
            confidence: Model confidence score
            impact: Impact magnitude (absolute return)
            days_ago: Days since prediction (for decay)
            decay_factor: Decay factor for time-based weighting

        Returns:
            WAT weight for the prediction
        """
        try:
            # Base weight from confidence and impact
            confidence_weight = confidence
            impact_weight = min(impact / 5.0, 2.0)  # Cap at 2x for very large moves

            # Time decay (optional)
            time_weight = decay_factor ** days_ago if days_ago > 0 else 1.0

            # Combined weight
            wat_weight = confidence_weight * impact_weight * time_weight

            return float(wat_weight)

        except Exception as e:
            logger.error(f"Error calculating WAT weight: {str(e)}")
            return 1.0

    def evaluate_prediction(self, sentiment_data: Dict, market_data: Dict,
                            news_date: datetime) -> Dict:
        """
        Comprehensive evaluation of a single prediction.

        Args:
            sentiment_data: Output from FinBERT analyzer
            market_data: Output from market data service
            news_date: Date when news was published

        Returns:
            Complete evaluation results
        """
        try:
            # Extract key values
            sentiment = sentiment_data.get("sentiment", "neutral")
            confidence = sentiment_data.get("confidence", 0.0)
            return_24h = market_data.get("return_24h")
            volatility_14d = market_data.get("volatility_14d")
            alpha_adjusted = market_data.get("alpha_adjusted")

            # Check for missing data
            if return_24h is None or volatility_14d is None:
                return {
                    "error": "Insufficient market data for evaluation",
                    "sentiment": sentiment,
                    "confidence": confidence
                }

            # Convert sentiment to direction
            sentiment_direction = self._get_sentiment_direction(sentiment)

            # Calculate volatility threshold
            threshold = volatility_14d * self.volatility_multiplier

            # Calculate DAS
            das_score = self.calculate_das(sentiment_direction, return_24h, volatility_14d)

            # Determine correctness
            is_correct = self._is_prediction_correct(sentiment_direction, return_24h, threshold)

            # Calculate WAT weight
            impact = abs(return_24h)
            wat_weight = self.calculate_wat_weight(confidence, impact)

            # Prepare results
            results = {
                "ticker": market_data.get("ticker", "Unknown"),
                "news_date": news_date.strftime("%Y-%m-%d"),
                "sentiment": sentiment,
                "confidence": confidence,
                "return_24h": return_24h,
                "volatility_14d": volatility_14d,
                "threshold": threshold,
                "das_score": das_score,
                "is_correct": is_correct,
                "wat_weight": wat_weight,
                "impact": impact,
                "alpha_adjusted": alpha_adjusted,
                "sentiment_direction": sentiment_direction,
                "evaluation_summary": self._generate_summary(
                    sentiment, confidence, return_24h, das_score, is_correct
                )
            }

            logger.info(f"Evaluation completed - DAS: {das_score:.3f}, Correct: {is_correct}")
            return results

        except Exception as e:
            logger.error(f"Error in prediction evaluation: {str(e)}")
            return {"error": str(e)}

    def _get_sentiment_direction(self, sentiment: str) -> int:
        """Convert sentiment to numerical direction."""
        sentiment_map = {
            "positive": 1,
            "negative": -1,
            "neutral": 0
        }
        return sentiment_map.get(sentiment.lower(), 0)

    def _is_prediction_correct(self, sentiment_direction: int, price_return: float,
                               threshold: float) -> bool:
        """
        Determine whether a prediction is correct based on volatility-aware thresholds.
        """
        if sentiment_direction == 0:  # Neutral
            return abs(price_return) <= threshold
        elif sentiment_direction == 1:  # Positive
            return price_return > threshold
        elif sentiment_direction == -1:  # Negative
            return price_return < -threshold
        else:
            return False

    def _generate_summary(self, sentiment: str, confidence: float,
                          return_24h: float, das_score: float, is_correct: bool) -> str:
        """Generate a human-readable evaluation summary."""
        direction = "📈" if return_24h > 0 else "📉" if return_24h < 0 else "➡️"

        # More nuanced verdict based on DAS score
        if is_correct:
            verdict = "✅ Aligned"
        else:
            if das_score > 0.7:
                verdict = "⚠️ Directionally Right, Magnitude Wrong"  # Right direction, wrong magnitude
            elif das_score > 0.3:
                verdict = "🔄 Partially Aligned"  # Some alignment
            else:
                verdict = "❌ Misaligned"  # Completely wrong

        confidence_level = "High" if confidence > 0.8 else "Medium" if confidence > 0.6 else "Low"

        return (f"{verdict} | {sentiment.title()} sentiment ({confidence_level} conf: {confidence:.2f}) "
                f"vs {direction} {return_24h:+.2f}% return | DAS: {das_score:.3f}")

    def calculate_batch_metrics(self, evaluations: List[Dict]) -> Dict:
        """
        Calculate aggregate metrics for a batch of evaluations.

        Args:
            evaluations: List of evaluation results

        Returns:
            Dictionary with aggregate metrics
        """
        try:
            if not evaluations:
                return {"error": "No evaluations provided"}

            # Filter out error results
            valid_evals = [e for e in evaluations if "error" not in e]

            if not valid_evals:
                return {"error": "No valid evaluations found"}

            # Calculate metrics
            das_scores = [e["das_score"] for e in valid_evals]
            correctness = [e["is_correct"] for e in valid_evals]
            confidences = [e["confidence"] for e in valid_evals]
            wat_weights = [e["wat_weight"] for e in valid_evals]

            # Aggregate metrics
            avg_das = float(np.mean(das_scores))
            accuracy = float(np.mean(correctness))
            avg_confidence = float(np.mean(confidences))

            # Weighted accuracy
            weighted_correctness = [float(c) * float(w) for c, w in zip(correctness, wat_weights)]
            total_weight = sum(wat_weights)
            weighted_accuracy = float(sum(weighted_correctness) / total_weight) if total_weight > 0 else 0.0

            # Confidence-accuracy correlation (handle single evaluation case)
            if len(confidences) > 1:
                try:
                    corr_matrix = np.corrcoef(confidences, correctness)
                    confidence_correlation = float(corr_matrix[0, 1])
                    # Handle NaN case (when all values are the same)
                    if np.isnan(confidence_correlation):
                        confidence_correlation = 0.0
                except Exception:
                    confidence_correlation = 0.0
            else:
                confidence_correlation = 0.0  # Cannot calculate correlation with a single point

            # Count high/low confidence predictions
            high_confidence_count = sum(1 for c in confidences if c > self.confidence_threshold)
            low_confidence_count = sum(1 for c in confidences if c < 0.6)

            return {
                "total_evaluations": len(valid_evals),
                "average_das": avg_das,
                "accuracy": accuracy,
                "weighted_accuracy": weighted_accuracy,
                "average_confidence": avg_confidence,
                "confidence_accuracy_correlation": confidence_correlation,
                "high_confidence_predictions": high_confidence_count,
                "low_confidence_predictions": low_confidence_count
            }

        except Exception as e:
            logger.error(f"Error calculating batch metrics: {str(e)}")
            return {"error": str(e)}
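The engine can also be exercised outside the app with hand-built dicts shaped like the analyzer and market-data outputs. A minimal sketch; the ticker, numbers, and date are illustrative, and src/ is assumed to be on the import path:

```python
from datetime import datetime
from evaluation import EvaluationEngine  # assumes src/ is on sys.path

engine = EvaluationEngine()  # defaults: k = 1.0, confidence threshold 0.7

# Inputs shaped like FinBERTAnalyzer / MarketDataService outputs (values made up)
sentiment_data = {"sentiment": "positive", "confidence": 0.85}
market_data = {"ticker": "TSLA", "return_24h": 4.2,
               "volatility_14d": 2.5, "alpha_adjusted": 3.1}

result = engine.evaluate_prediction(sentiment_data, market_data,
                                    news_date=datetime(2024, 1, 15))
print(result["das_score"], result["is_correct"])  # 1.0 True for these inputs
print(result["evaluation_summary"])
```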
src/market_data.py ADDED
@@ -0,0 +1,297 @@

# Market data fetching service using yfinance
"""
This module handles fetching historical stock price data, calculating returns,
volatility, and market index comparisons for evaluation purposes.
"""

import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from typing import Dict, Optional, Tuple
import logging
import streamlit as st

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class MarketDataService:
    """
    Service for fetching and processing market data for evaluation.
    """

    def __init__(self, market_index: str = "^GSPC"):
        """
        Initialize the market data service.

        Args:
            market_index: The market index ticker for macro adjustments (default: S&P 500)
        """
        self.market_index = market_index

    @st.cache_data(ttl=3600)  # Cache for 1 hour
    def fetch_stock_data(_self, ticker: str, start_date: datetime, end_date: datetime) -> Optional[pd.DataFrame]:
        """
        Fetch historical stock data for a given ticker and date range.

        Args:
            ticker: Stock ticker symbol
            start_date: Start date for data fetch
            end_date: End date for data fetch

        Returns:
            DataFrame with stock price data or None if failed
        """
        try:
            logger.info(f"Fetching data for {ticker} from {start_date} to {end_date}")
            stock = yf.Ticker(ticker)
            data = stock.history(start=start_date, end=end_date)

            if data.empty:
                logger.warning(f"No data found for ticker {ticker}")
                return None

            return data

        except Exception as e:
            logger.error(f"Error fetching data for {ticker}: {str(e)}")
            return None

    def calculate_same_day_return(self, data: pd.DataFrame, news_date: datetime) -> Optional[float]:
        """
        Calculate stock return on the same day the news was published (intraday).

        Args:
            data: Stock price DataFrame
            news_date: Date when news was published

        Returns:
            Intraday return percentage or None if calculation fails
        """
        try:
            # Convert news_date to date only for comparison
            news_date_only = news_date.date()

            # Find the trading day that matches the news date
            data_dates = data.index.date
            matching_dates = [d for d in data_dates if d == news_date_only]

            if not matching_dates:
                # If no exact match, find the next trading day
                future_dates = [d for d in data_dates if d > news_date_only]
                if not future_dates:
                    logger.warning(f"No trading data available for or after {news_date_only}")
                    return None
                trading_date = future_dates[0]
                logger.info(f"News date {news_date_only} was not a trading day, using next trading day: {trading_date}")
            else:
                trading_date = matching_dates[0]

            # Get the day's data
            day_data = data[data.index.date == trading_date]

            if len(day_data) == 0:
                logger.warning(f"No trading data found for {trading_date}")
                return None

            # Calculate intraday return: (Close - Open) / Open * 100
            open_price = day_data['Open'].iloc[0]
            close_price = day_data['Close'].iloc[-1]

            return_pct = ((close_price - open_price) / open_price) * 100

            logger.info(f"Calculated same-day return for {trading_date}: {return_pct:.2f}% (Open: {open_price:.2f}, Close: {close_price:.2f})")
            return float(return_pct)

        except Exception as e:
            logger.error(f"Error calculating same-day return: {str(e)}")
            return None

    def calculate_next_24h_return(self, data: pd.DataFrame, news_date: datetime) -> Optional[float]:
        """
        Calculate stock return over the next 24 hours after news publication.

        Args:
            data: Stock price DataFrame
            news_date: Date when news was published

        Returns:
            24-hour return percentage or None if calculation fails
        """
        try:
            # Convert news_date to date only for comparison
            news_date_only = news_date.date()

            # Find the trading day that matches the news date
            data_dates = data.index.date
            matching_dates = [d for d in data_dates if d == news_date_only]

            if not matching_dates:
                # If no exact match, find the next trading day
                future_dates = [d for d in data_dates if d > news_date_only]
                if not future_dates:
                    logger.warning(f"No trading data available for or after {news_date_only}")
                    return None
                start_trading_date = future_dates[0]
            else:
                start_trading_date = matching_dates[0]

            # Find the next trading day for 24h comparison
            future_dates = [d for d in data_dates if d > start_trading_date]
            if not future_dates:
                logger.warning(f"No next trading day available after {start_trading_date}")
                return None

            end_trading_date = future_dates[0]

            # Get start and end prices
            start_data = data[data.index.date == start_trading_date]
            end_data = data[data.index.date == end_trading_date]

            if len(start_data) == 0 or len(end_data) == 0:
                logger.warning(f"Insufficient data for 24h return calculation")
                return None

            # Use close of start day and close of next day
            start_price = start_data['Close'].iloc[-1]
            end_price = end_data['Close'].iloc[-1]

            return_pct = ((end_price - start_price) / start_price) * 100

            logger.info(f"Calculated 24h return from {start_trading_date} to {end_trading_date}: {return_pct:.2f}%")
            return float(return_pct)

        except Exception as e:
            logger.error(f"Error calculating 24h return: {str(e)}")
            return None

    def calculate_return(self, data: pd.DataFrame, news_date: datetime, hours: int = 24) -> Optional[float]:
        """
        Legacy method - now returns same-day return for compatibility.
        Use calculate_same_day_return() or calculate_next_24h_return() for specific needs.
        """
        return self.calculate_same_day_return(data, news_date)

    def calculate_volatility(self, data: pd.DataFrame, days: int = 14) -> Optional[float]:
        """
        Calculate rolling volatility for the stock.

        Args:
            data: Stock price DataFrame
            days: Number of days for volatility calculation

        Returns:
            Volatility percentage or None if calculation fails
        """
        try:
            if len(data) < days:
                logger.warning(f"Insufficient data for {days}-day volatility calculation")
                return None

            # Calculate daily returns
            data['Daily_Return'] = data['Close'].pct_change()

            # Calculate rolling volatility (annualized)
            volatility = data['Daily_Return'].rolling(window=days).std() * np.sqrt(252) * 100

            # Return the most recent volatility
            recent_volatility = volatility.dropna().iloc[-1]

            logger.info(f"Calculated {days}-day volatility: {recent_volatility:.2f}%")
            return float(recent_volatility)

        except Exception as e:
            logger.error(f"Error calculating volatility: {str(e)}")
            return None

    def get_market_return(self, news_date: datetime, hours: int = 24) -> Optional[float]:
        """
        Get market index return for the same day as news publication.

        Args:
            news_date: Date when news was published
            hours: Deprecated parameter (kept for compatibility)

        Returns:
            Market return percentage for the news day or None if calculation fails
        """
        try:
            # Fetch market data
            start_date = news_date - timedelta(days=5)  # Buffer for weekends
            end_date = news_date + timedelta(days=5)

            market_data = self.fetch_stock_data(self.market_index, start_date, end_date)

            if market_data is None:
                return None

            return self.calculate_return(market_data, news_date, hours)

        except Exception as e:
            logger.error(f"Error getting market return: {str(e)}")
            return None

    def get_stock_evaluation_data(self, ticker: str, news_date: datetime) -> Dict:
        """
        Get comprehensive stock data for evaluation including both same-day and 24h returns.

        Args:
            ticker: Stock ticker symbol
            news_date: Date when news was published

        Returns:
            Dictionary containing all relevant market data
        """
        try:
            # Define date range (get extra days for volatility calculation)
            start_date = news_date - timedelta(days=30)
            end_date = news_date + timedelta(days=5)

            # Fetch stock data
            stock_data = self.fetch_stock_data(ticker, start_date, end_date)

            if stock_data is None:
                return {"error": f"Could not fetch data for ticker {ticker}"}

            # Calculate both same-day and 24h returns
            same_day_return = self.calculate_same_day_return(stock_data, news_date)
            next_24h_return = self.calculate_next_24h_return(stock_data, news_date)
            volatility_14d = self.calculate_volatility(stock_data, 14)

            # Get market returns for both periods
            market_same_day = self.get_market_return(news_date, 0)  # Same day
            market_24h = self.get_market_return(news_date, 24)  # 24h

            # Calculate alpha-adjusted returns
            alpha_same_day = None
            alpha_24h = None

            if same_day_return is not None and market_same_day is not None:
                alpha_same_day = same_day_return - market_same_day

            if next_24h_return is not None and market_24h is not None:
                alpha_24h = next_24h_return - market_24h

            return {
                "ticker": ticker,
                "return_same_day": same_day_return,
                "return_next_24h": next_24h_return,
                "return_24h": same_day_return,  # Keep for compatibility with existing code
                "volatility_14d": volatility_14d,
                "market_return_same_day": market_same_day,
                "market_return_24h": market_24h,
                "market_return": market_same_day,  # Keep for compatibility
                "alpha_same_day": alpha_same_day,
                "alpha_24h": alpha_24h,
                "alpha_adjusted": alpha_same_day,  # Keep for compatibility
                "data_points": len(stock_data),
                "date_range": {
                    "start": stock_data.index[0].strftime("%Y-%m-%d"),
                    "end": stock_data.index[-1].strftime("%Y-%m-%d")
                }
            }

        except Exception as e:
            logger.error(f"Error getting evaluation data: {str(e)}")
            return {"error": str(e)}
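The two return windows above differ only in their anchor prices: the same-day return is open-to-close on the news day, while the next-24h return is close-to-close into the following trading day. A small worked example with made-up prices:

```python
# Illustrative prices (not real market data)
day1_open, day1_close = 100.0, 104.2   # news day
day2_close = 103.1                     # next trading day

same_day_return = (day1_close - day1_open) / day1_open * 100     # intraday reaction
next_24h_return = (day2_close - day1_close) / day1_close * 100   # follow-through

print(f"{same_day_return:+.2f}%")   # +4.20%, compared against the volatility threshold
print(f"{next_24h_return:+.2f}%")   # -1.06%, a reversal relative to the same-day move
```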
src/sentiment_analyzer.py ADDED
@@ -0,0 +1,120 @@

# FinBERT sentiment analysis module for financial news
"""
This module handles loading the ProsusAI/finbert model and extracting
sentiment predictions with confidence scores from financial news text.
"""

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import streamlit as st
from typing import Dict, Tuple
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FinBERTAnalyzer:
    """
    A wrapper class for the ProsusAI/finbert model to analyze financial sentiment.
    """

    def __init__(self, model_name: str = "ProsusAI/finbert"):
        """
        Initialize the FinBERT analyzer.

        Args:
            model_name: The Hugging Face model identifier
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @st.cache_resource
    def load_model(_self):
        """
        Load the FinBERT model and tokenizer with caching.
        Using _self to avoid hashing issues with the Streamlit cache.
        """
        try:
            logger.info(f"Loading FinBERT model: {_self.model_name}")
            _self.tokenizer = AutoTokenizer.from_pretrained(_self.model_name)
            _self.model = AutoModelForSequenceClassification.from_pretrained(_self.model_name)
            _self.model.to(_self.device)
            _self.model.eval()
            logger.info("FinBERT model loaded successfully")
            return True
        except Exception as e:
            logger.error(f"Error loading FinBERT model: {str(e)}")
            return False

    def analyze_sentiment(self, text: str) -> Dict[str, float]:
        """
        Analyze sentiment of financial news text.

        Args:
            text: The financial news text to analyze

        Returns:
            Dictionary containing sentiment label, confidence, and raw scores
        """
        if not self.model or not self.tokenizer:
            if not self.load_model():
                raise RuntimeError("Failed to load FinBERT model")

        try:
            # Tokenize input
            inputs = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Get predictions
            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Extract results
            scores = predictions.cpu().numpy()[0]
            # NOTE: this assumes the logit order negative/neutral/positive; it should
            # be verified against the model's id2label config, since FinBERT variants
            # publish different label orderings.
            labels = ["negative", "neutral", "positive"]

            # Find the predicted sentiment and confidence
            predicted_idx = scores.argmax()
            predicted_sentiment = labels[predicted_idx]
            confidence = float(scores[predicted_idx])

            return {
                "sentiment": predicted_sentiment,
                "confidence": confidence,
                "scores": {
                    "negative": float(scores[0]),
                    "neutral": float(scores[1]),
                    "positive": float(scores[2])
                }
            }

        except Exception as e:
            logger.error(f"Error analyzing sentiment: {str(e)}")
            raise RuntimeError(f"Sentiment analysis failed: {str(e)}")

    def get_sentiment_direction(self, sentiment: str) -> int:
        """
        Convert sentiment label to numerical direction for evaluation.

        Args:
            sentiment: The sentiment label ("positive", "negative", "neutral")

        Returns:
            1 for positive, -1 for negative, 0 for neutral
        """
        sentiment_map = {
            "positive": 1,
            "negative": -1,
            "neutral": 0
        }
        return sentiment_map.get(sentiment.lower(), 0)
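The analyzer can be driven directly from a Python shell. A minimal sketch: the headline is illustrative, src/ is assumed to be on the import path, and because load_model is wrapped in st.cache_resource, running it outside a Streamlit session may log a missing-runtime warning while still executing:

```python
from sentiment_analyzer import FinBERTAnalyzer  # assumes src/ is on sys.path

analyzer = FinBERTAnalyzer()  # downloads ProsusAI/finbert on first use

result = analyzer.analyze_sentiment("Apple reports record quarterly earnings")
print(result["sentiment"], f"{result['confidence']:.2f}")
print(result["scores"])  # raw softmax scores keyed negative / neutral / positive
```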
src/streamlit_app.py
CHANGED
|
@@ -1,40 +1,374 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
| 4 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
|
| 13 |
-
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
| 1 |
+
# FinBERT Market Evaluation - Main Streamlit Application
|
| 2 |
+
"""
|
| 3 |
+
A confidence-aware, volatility-adjusted post-market evaluator for FinBERT sentiment
|
| 4 |
+
predictions against actual stock market movements.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
import streamlit as st
|
| 8 |
+
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
import plotly.graph_objects as go
|
| 11 |
+
import plotly.express as px
|
| 12 |
+
from datetime import datetime, timedelta, date
|
| 13 |
+
import time
|
| 14 |
+
import logging
|
| 15 |
|
| 16 |
+
# Import our custom modules
|
| 17 |
+
from sentiment_analyzer import FinBERTAnalyzer
|
| 18 |
+
from market_data import MarketDataService
|
| 19 |
+
from evaluation import EvaluationEngine
|
| 20 |
|
| 21 |
+
# Configure logging
|
| 22 |
+
logging.basicConfig(level=logging.INFO)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
|
| 25 |
+
# Page configuration
|
| 26 |
+
st.set_page_config(
|
| 27 |
+
page_title="FinBERT Market Evaluation",
|
| 28 |
+
page_icon="🚀",
|
| 29 |
+
layout="wide",
|
| 30 |
+
initial_sidebar_state="expanded"
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
# Initialize session state for rate limiting
|
| 34 |
+
if 'last_request_time' not in st.session_state:
|
| 35 |
+
st.session_state.last_request_time = 0
|
| 36 |
+
|
| 37 |
+
if 'evaluation_history' not in st.session_state:
|
| 38 |
+
st.session_state.evaluation_history = []
|
| 39 |
+
|
| 40 |
+
# Initialize services
|
| 41 |
+
@st.cache_resource
|
| 42 |
+
def initialize_services():
|
| 43 |
+
"""Initialize all services with caching."""
|
| 44 |
+
analyzer = FinBERTAnalyzer()
|
| 45 |
+
market_service = MarketDataService()
|
| 46 |
+
evaluation_engine = EvaluationEngine()
|
| 47 |
+
return analyzer, market_service, evaluation_engine
|
| 48 |
+
|
| 49 |
+
def check_rate_limit():
|
| 50 |
+
"""Check if rate limit allows new request (30 seconds)."""
|
| 51 |
+
current_time = time.time()
|
| 52 |
+
time_since_last = current_time - st.session_state.last_request_time
|
| 53 |
+
return time_since_last >= 30
|
| 54 |
+
|
| 55 |
+
def update_rate_limit():
|
| 56 |
+
"""Update the last request time."""
|
| 57 |
+
st.session_state.last_request_time = time.time()
|
| 58 |
+
|
| 59 |
+
def create_das_chart(das_score: float, confidence: float, impact: float):
|
| 60 |
+
"""Create horizontal bar chart for DAS, confidence, and impact."""
|
| 61 |
+
fig = go.Figure()
|
| 62 |
+
|
| 63 |
+
metrics = ['DAS Score', 'Confidence', 'Impact (scaled)']
|
| 64 |
+
values = [das_score, confidence, min(impact / 5.0, 1.0)] # Scale impact to 0-1
|
| 65 |
+
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
|
| 66 |
+
|
| 67 |
+
fig.add_trace(go.Bar(
|
| 68 |
+
y=metrics,
|
| 69 |
+
x=values,
|
| 70 |
+
orientation='h',
|
| 71 |
+
marker_color=colors,
|
| 72 |
+
text=[f'{v:.3f}' for v in values],
|
| 73 |
+
textposition='inside'
|
| 74 |
+
))
|
| 75 |
+
|
| 76 |
+
fig.update_layout(
|
| 77 |
+
title="Evaluation Metrics",
|
| 78 |
+
xaxis_title="Score",
|
| 79 |
+
height=200,
|
| 80 |
+
margin=dict(l=100, r=50, t=50, b=50)
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
return fig
|
| 84 |
+
|
| 85 |
+
def display_evaluation_result(result: dict):
|
| 86 |
+
"""Display comprehensive evaluation results."""
|
| 87 |
+
if "error" in result:
|
| 88 |
+
st.error(f"Evaluation Error: {result['error']}")
|
| 89 |
+
return
|
| 90 |
+
|
| 91 |
+
# Prominent evaluation summary first
|
| 92 |
+
st.markdown(f"### {result['evaluation_summary']}")
|
| 93 |
+
|
| 94 |
+
# Key insights in a highlighted box
|
| 95 |
+
alignment_color = "green" if result['is_correct'] else "red"
|
| 96 |
+
volatility_note = "🔥 Extremely High" if result['volatility_14d'] > 100 else "📊 High" if result['volatility_14d'] > 50 else "📈 Normal"
|
| 97 |
+
|
| 98 |
+
# Calculate if movement was significant
|
| 99 |
+
movement_significant = result['impact'] > result['threshold']
|
| 100 |
+
significance_text = "exceeded" if movement_significant else "was below"
|
| 101 |
+
|
| 102 |
+
st.markdown(f"""
|
| 103 |
+
<div style="background-color: rgba(0,0,0,0.1); padding: 15px; border-radius: 10px; margin: 10px 0;">
|
| 104 |
+
<h4>📊 Volatility-Aware Analysis:</h4>
|
| 105 |
+
<ul>
|
| 106 |
+
<li><strong>Stock's 14-day volatility:</strong> {result['volatility_14d']:.1f}% ({volatility_note.lower()})</li>
|
| 107 |
+
<li><strong>Significance threshold:</strong> {result['threshold']:.1f}% (= 1.0 × volatility)</li>
|
| 108 |
+
<li><strong>Actual movement:</strong> {result['return_24h']:+.2f}% ({result['impact']:.2f}% magnitude)</li>
|
| 109 |
+
<li><strong>Movement significance:</strong> {significance_text} threshold → {'Significant' if movement_significant else 'Not significant'}</li>
|
| 110 |
+
<li><strong>Directional alignment:</strong> <span style="color: {alignment_color};">{'✅ Correct direction' if result['is_correct'] else '❌ Wrong direction or insufficient magnitude'}</span></li>
|
| 111 |
+
<li><strong>Model confidence:</strong> {'High' if result['confidence'] > 0.8 else 'Medium' if result['confidence'] > 0.6 else 'Low'} ({result['confidence']:.1%})</li>
|
| 112 |
+
</ul>
|
| 113 |
+
</div>
|
| 114 |
+
""", unsafe_allow_html=True)
|
| 115 |
+
|
| 116 |
+
# Main metrics in columns
|
| 117 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 118 |
+
|
| 119 |
+
with col1:
|
| 120 |
+
st.metric("DAS Score", f"{result['das_score']:.3f}", help="Directional Alignment Score (0-1, higher is better)")
|
| 121 |
+
|
| 122 |
+
with col2:
|
| 123 |
+
sentiment_emoji = {"positive": "📈", "negative": "📉", "neutral": "➡️"}
|
| 124 |
+
st.metric("Sentiment", f"{sentiment_emoji.get(result['sentiment'], '❓')} {result['sentiment'].title()}")
|
| 125 |
+
|
| 126 |
+
with col3:
|
| 127 |
+
st.metric("Confidence", f"{result['confidence']:.1%}")
|
| 128 |
+
|
| 129 |
+
with col4:
|
| 130 |
+
return_color = "normal" if abs(result['return_24h']) < result['threshold'] else "inverse"
|
| 131 |
+
st.metric("Same-Day Return", f"{result['return_24h']:+.2f}%", delta=f"vs {result['threshold']:.1f}% threshold")
|
| 132 |
+
|
| 133 |
+
# Additional metrics for 24h return if available
|
| 134 |
+
if result.get('return_next_24h') is not None:
|
| 135 |
+
col1, col2, col3, col4 = st.columns(4)
|
| 136 |
+
with col1:
|
| 137 |
+
st.metric("Next 24h Return", f"{result['return_next_24h']:+.2f}%", help="Return from close of news day to close of next trading day")
|
| 138 |
+
with col2:
|
| 139 |
+
if result.get('alpha_24h') is not None:
|
| 140 |
+
st.metric("24h Alpha", f"{result['alpha_24h']:+.2f}%", help="24h return vs market performance")
|
| 141 |
+
with col3:
|
| 142 |
+
# Show combined impact
|
| 143 |
+
combined_impact = abs(result['return_24h']) + abs(result.get('return_next_24h', 0))
|
| 144 |
+
st.metric("Combined Impact", f"{combined_impact:.2f}%", help="Total magnitude of price movement")
|
| 145 |
+
with col4:
|
| 146 |
+
# Show follow-through consistency
|
| 147 |
+
same_direction = (result['return_24h'] * result.get('return_next_24h', 0)) > 0
|
| 148 |
+
consistency = "✅ Consistent" if same_direction else "🔄 Reversal"
|
| 149 |
+
st.metric("Follow-through", consistency, help="Whether 24h movement continued same direction")
|
| 150 |
+
|
| 151 |
+
# Visualization
|
| 152 |
+
chart = create_das_chart(result['das_score'], result['confidence'], result['impact'])
|
| 153 |
+
# Use session state to create unique chart counter
|
| 154 |
+
if 'chart_counter' not in st.session_state:
|
| 155 |
+
st.session_state.chart_counter = 0
|
| 156 |
+
st.session_state.chart_counter += 1
|
| 157 |
+
chart_key = f"chart_{st.session_state.chart_counter}"
|
| 158 |
+
st.plotly_chart(chart, use_container_width=True, key=chart_key)
|
| 159 |
+
|
| 160 |
+
# Technical metrics (always visible)
|
| 161 |
+
st.subheader("📊 Technical Metrics")
|
| 162 |
+
|
| 163 |
+
col1, col2, col3 = st.columns(3)
|
| 164 |
+
|
| 165 |
+
with col1:
|
| 166 |
+
st.metric("Ticker", result['ticker'])
|
| 167 |
+
st.metric("News Date", result['news_date'])
|
| 168 |
+
st.metric("14-day Volatility", f"{result['volatility_14d']:.2f}%")
|
| 169 |
+
st.metric("Significance Threshold", f"{result['threshold']:.2f}%")
|
| 170 |
+
|
| 171 |
+
with col2:
|
| 172 |
+
st.metric("Same-Day Impact", f"{result['impact']:.2f}%")
|
| 173 |
+
if result.get('return_next_24h') is not None:
|
| 174 |
+
st.metric("24h Impact", f"{abs(result['return_next_24h']):.2f}%")
|
| 175 |
+
st.metric("WAT Weight", f"{result['wat_weight']:.3f}")
|
| 176 |
+
alignment_text = "✅ Yes" if result['is_correct'] else "❌ No"
|
| 177 |
+
st.metric("Alignment", alignment_text)
|
| 178 |
+
|
| 179 |
+
with col3:
|
| 180 |
+
alpha_val = result.get('alpha_adjusted', 'N/A')
|
| 181 |
+
alpha_str = f"{alpha_val:+.2f}%" if isinstance(alpha_val, (int, float)) else str(alpha_val)
|
| 182 |
+
st.metric("Same-Day Alpha", alpha_str)
|
| 183 |
+
|
| 184 |
+
if result.get('alpha_24h') is not None:
|
| 185 |
+
st.metric("24h Alpha", f"{result['alpha_24h']:+.2f}%")
|
| 186 |
+
|
| 187 |
+
# Market context
|
| 188 |
+
market_same = result.get('market_return', 'N/A')
|
| 189 |
+
market_str = f"{market_same:+.2f}%" if isinstance(market_same, (int, float)) else str(market_same)
|
| 190 |
+
st.metric("Market Return", market_str)
|
| 191 |
+
|
| 192 |
+
def main():
|
| 193 |
+
"""Main application function."""
|
| 194 |
+
# Header
|
| 195 |
+
st.title("🚀 FinBERT Market Evaluation")
|
| 196 |
+
st.markdown("""
|
| 197 |
+
A confidence-aware, volatility-adjusted post-market evaluator for FinBERT sentiment predictions.
|
| 198 |
+
Evaluate how well FinBERT's financial news sentiment aligns with actual stock market movements.
|
| 199 |
+
""")
|
| 200 |
+
|
| 201 |
+
# Sidebar info (no user configuration needed)
|
| 202 |
+
st.sidebar.header("📊 Evaluation Framework")
|
| 203 |
+
st.sidebar.markdown("""
|
| 204 |
+
**Dual-Period Analysis:**
|
| 205 |
+
- **Same-Day**: Intraday return (Close - Open)
|
| 206 |
+
- **Next 24h**: Close-to-close follow-through
|
| 207 |
+
- **Combined**: Complete market reaction picture
|
| 208 |
+
|
| 209 |
+
**Volatility-Aware Evaluation:**
|
| 210 |
+
- Uses each stock's 14-day volatility
|
| 211 |
+
- Threshold = 1.0 × volatility (k=1.0)
|
| 212 |
+
- Adapts to stock movement patterns
|
| 213 |
+
|
| 214 |
+
**Directional Alignment Score:**
|
| 215 |
+
- Graded 0-1 score (not binary)
|
| 216 |
+
- Based on same-day return vs threshold
|
| 217 |
+
- Higher = better alignment
|
| 218 |
+
|
| 219 |
+
**Alpha Analysis:**
|
| 220 |
+
- Stock return vs market performance
|
| 221 |
+
- Isolates stock-specific impact
|
| 222 |
+
- Available for both time periods
|
| 223 |
+
""")
|
| 224 |
+
|
| 225 |
+
# Fixed research parameters (not user-configurable)
|
| 226 |
+
volatility_multiplier = 1.0 # k = 1.0 as per your framework
|
| 227 |
+
confidence_threshold = 0.7 # Reasonable default
|
| 228 |
+
|
| 229 |
+
# Initialize services
|
| 230 |
+
try:
|
| 231 |
+
analyzer, market_service, evaluation_engine = initialize_services()
|
| 232 |
+
evaluation_engine.volatility_multiplier = volatility_multiplier
|
| 233 |
+
evaluation_engine.confidence_threshold = confidence_threshold
|
| 234 |
+
except Exception as e:
|
| 235 |
+
st.error(f"Failed to initialize services: {str(e)}")
|
| 236 |
+
st.stop()
|
| 237 |
+
|
| 238 |
+
# Main input form
|
| 239 |
+
st.header("📰 News Analysis")
|
src/streamlit_app.py (continued):

```python
    with st.form("evaluation_form"):
        # News text input
        news_text = st.text_area(
            "Financial News Text",
            height=150,
            placeholder="Enter financial news headline or summary here...",
            help="Paste the financial news text you want to analyze"
        )

        col1, col2 = st.columns(2)

        with col1:
            ticker = st.text_input(
                "Stock Ticker",
                placeholder="e.g., TSLA, AAPL, MSFT",
                help="Enter the stock ticker symbol"
            ).upper()

        with col2:
            news_date = st.date_input(
                "News Publication Date",
                value=date.today() - timedelta(days=1),
                max_value=date.today() - timedelta(days=1),
                help="Date when the news was published (must be at least 1 day ago)"
            )

        submitted = st.form_submit_button("🔍 Evaluate Prediction")

    # Process evaluation
    if submitted:
        if not news_text.strip():
            st.error("Please enter some news text to analyze.")
            return

        if not ticker:
            st.error("Please enter a stock ticker symbol.")
            return

        # Rate-limiting check
        if not check_rate_limit():
            remaining_time = 30 - (time.time() - st.session_state.last_request_time)
            st.warning(f"Rate limit: Please wait {remaining_time:.0f} more seconds before the next request.")
            return

        # Record this request for rate limiting
        update_rate_limit()

        # Show progress
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # Step 1: Sentiment analysis
            status_text.text("🤖 Analyzing sentiment with FinBERT...")
            progress_bar.progress(25)

            sentiment_result = analyzer.analyze_sentiment(news_text)

            # Step 2: Market data
            status_text.text("📊 Fetching market data...")
            progress_bar.progress(50)

            news_datetime = datetime.combine(news_date, datetime.min.time())
            market_result = market_service.get_stock_evaluation_data(ticker, news_datetime)

            # Step 3: Evaluation
            status_text.text("⚖️ Evaluating prediction...")
            progress_bar.progress(75)

            evaluation_result = evaluation_engine.evaluate_prediction(
                sentiment_result, market_result, news_datetime
            )

            # Step 4: Display results
            status_text.text("✅ Evaluation complete!")
            progress_bar.progress(100)

            # Clear progress indicators
            time.sleep(0.5)
            progress_bar.empty()
            status_text.empty()

            # Display results
            st.header("📊 Evaluation Results")
            display_evaluation_result(evaluation_result)

            # Add to history
            if "error" not in evaluation_result:
                st.session_state.evaluation_history.append(evaluation_result)

        except Exception as e:
            progress_bar.empty()
            status_text.empty()
            st.error(f"Evaluation failed: {str(e)}")
            logger.error(f"Evaluation error: {str(e)}")

    # Evaluation history section
    if st.session_state.evaluation_history:
        st.header("📋 Previous Evaluations")

        # Show most recent evaluations first (reverse chronological)
        recent_evaluations = list(reversed(st.session_state.evaluation_history))

        # Show recent evaluations in expandable cards
        for i, result in enumerate(recent_evaluations):
            # Build a concise title for each evaluation
            alignment_icon = "✅" if result['is_correct'] else "❌"
            sentiment_icon = {"positive": "📈", "negative": "📉", "neutral": "➡️"}.get(result['sentiment'], "❓")

            title = (
                f"{alignment_icon} {result['ticker']} ({result['news_date']}) - "
                f"{sentiment_icon} {result['sentiment'].title()} → "
                f"{result['return_24h']:+.1f}% | DAS: {result['das_score']:.3f}"
            )

            with st.expander(title, expanded=(i == 0)):  # Expand the most recent one
                display_evaluation_result(result)

        st.markdown("---")

        # Simple action buttons
        col1, col2 = st.columns([1, 3])

        with col1:
            if st.button("🗑️ Clear All History"):
                st.session_state.evaluation_history = []
                st.rerun()

        with col2:
            st.caption(f"📊 {len(st.session_state.evaluation_history)} evaluation(s) completed")

    # Footer
    st.markdown("---")
    st.caption("🚀 **FinBERT Market Evaluation** | Rate limit: 30s | Model: ProsusAI/finbert | Data: Yahoo Finance")


if __name__ == "__main__":
    main()
```
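The flow above calls two rate-limiting helpers, `check_rate_limit()` and `update_rate_limit()`, which are defined earlier in `src/streamlit_app.py` (outside this excerpt). A minimal sketch of what they need to provide — not the file's actual definitions — assuming `st.session_state.last_request_time` is initialized to `0.0` at startup:

```python
import time
import streamlit as st

RATE_LIMIT_SECONDS = 30  # assumed constant, matching the 30s limit shown in the footer

def check_rate_limit() -> bool:
    """True when at least RATE_LIMIT_SECONDS have passed since the last request."""
    return time.time() - st.session_state.last_request_time >= RATE_LIMIT_SECONDS

def update_rate_limit() -> None:
    """Record the current request time for the next check."""
    st.session_state.last_request_time = time.time()
```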
src/visualizations.py ADDED
@@ -0,0 +1,302 @@
```python
# Visualization components for FinBERT Market Evaluation
"""
This module provides additional visualization components, including
calibration plots, performance-over-time charts, sentiment distributions,
and confidence-impact scatter plots.
"""

import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from typing import List, Dict
import streamlit as st


def create_calibration_plot(evaluations: List[Dict]) -> go.Figure:
    """
    Create a calibration plot showing confidence vs actual accuracy.

    Args:
        evaluations: List of evaluation results

    Returns:
        Plotly figure for the calibration plot
    """
    if not evaluations:
        return go.Figure()

    # Extract confidence and correctness
    confidences = [e['confidence'] for e in evaluations if 'confidence' in e]
    correctness = [e['is_correct'] for e in evaluations if 'is_correct' in e]

    if len(confidences) != len(correctness) or len(confidences) < 5:
        return go.Figure()

    # Create confidence bins
    bins = np.linspace(0.5, 1.0, 6)  # 5 bins from 0.5 to 1.0
    bin_centers = (bins[:-1] + bins[1:]) / 2

    # Calculate accuracy for each bin
    bin_accuracies = []
    bin_counts = []

    for i in range(len(bins) - 1):
        mask = (np.array(confidences) >= bins[i]) & (np.array(confidences) < bins[i + 1])
        if i == len(bins) - 2:  # Last bin includes the upper bound
            mask = (np.array(confidences) >= bins[i]) & (np.array(confidences) <= bins[i + 1])

        bin_correct = np.array(correctness)[mask]
        if len(bin_correct) > 0:
            bin_accuracies.append(np.mean(bin_correct))
            bin_counts.append(len(bin_correct))
        else:
            bin_accuracies.append(0)
            bin_counts.append(0)

    # Create figure
    fig = go.Figure()

    # Perfect calibration line
    fig.add_trace(go.Scatter(
        x=[0.5, 1.0],
        y=[0.5, 1.0],
        mode='lines',
        name='Perfect Calibration',
        line=dict(dash='dash', color='gray')
    ))

    # Actual calibration
    fig.add_trace(go.Scatter(
        x=bin_centers,
        y=bin_accuracies,
        mode='markers+lines',
        name='Actual Calibration',
        marker=dict(size=[c / 2 + 5 for c in bin_counts]),  # Marker size scales with bin count
        text=[f'Count: {c}' for c in bin_counts],
        hovertemplate='Confidence: %{x:.2f}<br>Accuracy: %{y:.2f}<br>%{text}<extra></extra>'
    ))

    fig.update_layout(
        title='Calibration Plot: Confidence vs Accuracy',
        xaxis_title='Predicted Confidence',
        yaxis_title='Actual Accuracy',
        xaxis=dict(range=[0.5, 1.0]),
        yaxis=dict(range=[0.0, 1.0]),
        height=400
    )

    return fig
```
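For orientation: the loop above assigns each prediction's confidence to one of five equal-width bins on [0.5, 1.0] and compares the mean correctness in each bin against the bin's confidence level. A confidence of exactly 1.0 would fall outside a half-open last bin, which is why the final bin is widened to include its upper bound. A tiny illustration of the same bin assignment, using invented confidence values (not app output):

```python
import numpy as np

# Hypothetical confidences; bin edges match create_calibration_plot above.
confidences = np.array([0.55, 0.62, 0.71, 0.83, 0.94, 0.97])
bins = np.linspace(0.5, 1.0, 6)  # edges: 0.5, 0.6, 0.7, 0.8, 0.9, 1.0

# np.digitize returns the 1-based bin index for each value
print(np.digitize(confidences, bins))  # -> [1 2 3 4 5 5]
```

The module continues with the remaining chart builders: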
```python
def create_performance_over_time(evaluations: List[Dict]) -> go.Figure:
    """
    Create a time series plot of performance metrics.

    Args:
        evaluations: List of evaluation results

    Returns:
        Plotly figure for performance over time
    """
    if not evaluations:
        return go.Figure()

    # Convert to DataFrame
    df = pd.DataFrame(evaluations)
    df['news_date'] = pd.to_datetime(df['news_date'])
    df = df.sort_values('news_date')

    # Calculate rolling metrics
    window = min(5, len(df))  # 5-day rolling window, or fewer if less data
    df['rolling_das'] = df['das_score'].rolling(window=window, min_periods=1).mean()
    df['rolling_accuracy'] = df['is_correct'].rolling(window=window, min_periods=1).mean()

    fig = go.Figure()

    # DAS score over time
    fig.add_trace(go.Scatter(
        x=df['news_date'],
        y=df['rolling_das'],
        mode='lines+markers',
        name='Rolling DAS Score',
        line=dict(color='blue'),
        yaxis='y'
    ))

    # Accuracy over time
    fig.add_trace(go.Scatter(
        x=df['news_date'],
        y=df['rolling_accuracy'],
        mode='lines+markers',
        name='Rolling Accuracy',
        line=dict(color='red'),
        yaxis='y2'
    ))

    fig.update_layout(
        title=f'Performance Over Time (Rolling {window}-day average)',
        xaxis_title='Date',
        yaxis=dict(
            title='DAS Score',
            side='left',
            range=[0, 1]
        ),
        yaxis2=dict(
            title='Accuracy',
            side='right',
            overlaying='y',
            range=[0, 1]
        ),
        height=400,
        hovermode='x unified'
    )

    return fig


def create_sentiment_distribution(evaluations: List[Dict]) -> go.Figure:
    """
    Create a distribution plot of sentiments and their performance.

    Args:
        evaluations: List of evaluation results

    Returns:
        Plotly figure for sentiment distribution
    """
    if not evaluations:
        return go.Figure()

    df = pd.DataFrame(evaluations)

    # Group by sentiment
    sentiment_stats = df.groupby('sentiment').agg({
        'das_score': ['mean', 'count'],
        'is_correct': 'mean',
        'confidence': 'mean'
    }).round(3)

    sentiment_stats.columns = ['avg_das', 'count', 'accuracy', 'avg_confidence']
    sentiment_stats = sentiment_stats.reset_index()

    # Create a dual-axis figure
    fig = go.Figure()

    # Bar chart for counts
    fig.add_trace(go.Bar(
        x=sentiment_stats['sentiment'],
        y=sentiment_stats['count'],
        name='Count',
        marker_color='lightblue',
        yaxis='y',
        text=sentiment_stats['count'],
        textposition='auto'
    ))

    # Line chart for accuracy
    fig.add_trace(go.Scatter(
        x=sentiment_stats['sentiment'],
        y=sentiment_stats['accuracy'],
        mode='lines+markers',
        name='Accuracy',
        line=dict(color='red'),
        yaxis='y2',
        marker=dict(size=10)
    ))

    fig.update_layout(
        title='Sentiment Distribution and Performance',
        xaxis_title='Sentiment',
        yaxis=dict(
            title='Count',
            side='left'
        ),
        yaxis2=dict(
            title='Accuracy',
            side='right',
            overlaying='y',
            range=[0, 1]
        ),
        height=400
    )

    return fig


def create_confidence_impact_scatter(evaluations: List[Dict]) -> go.Figure:
    """
    Create a scatter plot of confidence vs impact with DAS score coloring.

    Args:
        evaluations: List of evaluation results

    Returns:
        Plotly figure for the confidence-impact scatter
    """
    if not evaluations:
        return go.Figure()

    df = pd.DataFrame(evaluations)

    # Create scatter plot
    fig = px.scatter(
        df,
        x='confidence',
        y='impact',
        color='das_score',
        size='wat_weight',
        hover_data=['ticker', 'sentiment', 'return_24h'],
        color_continuous_scale='RdYlBu_r',
        title='Confidence vs Impact (colored by DAS Score)'
    )

    fig.update_layout(
        xaxis_title='Confidence',
        yaxis_title='Impact (|Return %|)',
        height=400
    )

    return fig


def display_advanced_visualizations(evaluations: List[Dict]):
    """
    Display advanced visualization components in Streamlit.

    Args:
        evaluations: List of evaluation results
    """
    if len(evaluations) < 3:
        st.info("Need at least 3 evaluations for advanced visualizations.")
        return

    st.subheader("📊 Advanced Analytics")

    # Create tabs for different visualizations
    tab1, tab2, tab3, tab4 = st.tabs([
        "Calibration", "Performance Over Time",
        "Sentiment Analysis", "Confidence vs Impact"
    ])

    with tab1:
        st.plotly_chart(
            create_calibration_plot(evaluations),
            use_container_width=True
        )
        st.caption("Shows how well confidence scores align with actual accuracy. Points closer to the diagonal line indicate better calibration.")

    with tab2:
        st.plotly_chart(
            create_performance_over_time(evaluations),
            use_container_width=True
        )
        st.caption("Rolling average of DAS scores and accuracy over time.")

    with tab3:
        st.plotly_chart(
            create_sentiment_distribution(evaluations),
            use_container_width=True
        )
        st.caption("Distribution of sentiment predictions and their respective performance.")

    with tab4:
        st.plotly_chart(
            create_confidence_impact_scatter(evaluations),
            use_container_width=True
        )
        st.caption("Relationship between model confidence and market impact, colored by DAS score.")
```
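For reference, each evaluation dict must carry the keys this module reads: `ticker`, `news_date`, `sentiment`, `confidence`, `is_correct`, `das_score`, `return_24h`, `impact`, and `wat_weight`. A hypothetical, self-contained usage sketch — the records below are invented; in the app the list comes from `st.session_state.evaluation_history`:

```python
from src.visualizations import display_advanced_visualizations

# Invented records for illustration only; real ones come from the evaluation engine.
sample_evaluations = [
    {"ticker": "NVDA", "news_date": "2024-03-01", "sentiment": "positive",
     "confidence": 0.88, "is_correct": True, "das_score": 0.74,
     "return_24h": 2.6, "impact": 2.6, "wat_weight": 0.7},
    {"ticker": "INTC", "news_date": "2024-03-04", "sentiment": "negative",
     "confidence": 0.69, "is_correct": False, "das_score": 0.22,
     "return_24h": 1.3, "impact": 1.3, "wat_weight": 0.4},
    {"ticker": "AMD", "news_date": "2024-03-05", "sentiment": "neutral",
     "confidence": 0.61, "is_correct": True, "das_score": 0.58,
     "return_24h": -0.2, "impact": 0.2, "wat_weight": 0.3},
]

# Inside a running Streamlit app, this renders the four analytics tabs:
display_advanced_visualizations(sample_evaluations)
```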