Spaces:
Sleeping
Sleeping
| import torch | |
| import gradio as gr | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| from transformers import pipeline | |
| import numpy as np | |
| from tqdm.auto import tqdm | |
| import warnings | |
| import os | |
| from datetime import datetime, timedelta | |
| from scipy.stats import pearsonr | |
| import ast | |
# Suppress FutureWarning messages for the whole process.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Credits rendered in the dashboard header.
DEVELOPER_NAME = "汪于捷、李哲弘、黃千宥、陳奕瑄、洪寓澤"

# Raw input files and the cache written after the first preprocessing run.
NEWS_CSV_PATH = "cryptonews.csv"
BTC_CSV_PATH = "BTC.csv"
PROCESSED_DATA_PATH = "processed_btc_sentiment_data.csv"

# Plotly template applied to every figure.
PLOTLY_TEMPLATE = "plotly_dark"

# Lazily created Hugging Face pipeline; populated by initialize_pipeline().
SENTIMENT_PIPELINE = None
def initialize_pipeline():
    """Create the shared sentiment-analysis pipeline on first use.

    When the module-level ``SENTIMENT_PIPELINE`` is already set this is a
    no-op. Otherwise a ``transformers`` pipeline for the
    ``cardiffnlp/twitter-roberta-base-sentiment`` checkpoint is built with
    ``device=-1`` and stored in ``SENTIMENT_PIPELINE``.

    Loading errors are printed rather than raised and leave
    ``SENTIMENT_PIPELINE`` as ``None``; callers must check for that.
    """
    global SENTIMENT_PIPELINE

    # Already loaded: nothing to do.
    if SENTIMENT_PIPELINE is not None:
        return

    try:
        print("⏳ 正在載入情緒分析模型 (Hugging Face)...")
        checkpoint = "cardiffnlp/twitter-roberta-base-sentiment"
        SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis",
            model=checkpoint,
            tokenizer=checkpoint,
            device=-1,
        )
        print("✅ 模型載入成功!")
    except Exception as e:
        # Best-effort load: report the problem and leave the pipeline unset.
        print(f"❌ 載入模型時發生錯誤: {e}")
        SENTIMENT_PIPELINE = None
def safe_literal_eval(val):
    """Parse a Python-literal string into a dict, or return ``{}``.

    Args:
        val: Text expected to hold a dict literal such as
            ``"{'class': 'positive', 'polarity': 0.5}"``.

    Returns:
        The parsed dict. An empty dict is returned when ``val`` cannot be
        parsed or when it parses to something other than a dict, so callers
        can always use ``.get(...)`` on the result.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input
        # (e.g. TypeError for an unhashable dict key).
        return {}
    # A valid literal that is not a dict (list, str, number...) is as
    # unusable to the callers as a parse failure.
    return parsed if isinstance(parsed, dict) else {}
def preprocess_and_cache_data():
    """Build the per-day sentiment/price table and cache it as CSV.

    Steps:
      1. Read ``NEWS_CSV_PATH``, drop rows missing ``title``, ``text``,
         ``sentiment`` or a parseable ``date``.
      2. Score ``title + ". " + text`` with ``SENTIMENT_PIPELINE`` and map
         ``LABEL_2/LABEL_1/LABEL_0`` to ``1/0/-1``.
      3. Parse the stored ``sentiment`` column into class score, polarity
         and subjectivity.
      4. Average all four metrics per calendar day.
      5. Read ``BTC_CSV_PATH``, compute the day-over-day percentage change of
         ``close``, and inner-join it with the daily metrics on ``date``.
      6. Attach the per-day lists of titles and texts, write the result to
         ``PROCESSED_DATA_PATH`` and return it.

    Returns:
        pandas.DataFrame: one row per date present in both inputs.

    Raises:
        FileNotFoundError: if either input CSV is missing.
        RuntimeError: if the sentiment pipeline could not be initialised.
    """
    if not (os.path.exists(NEWS_CSV_PATH) and os.path.exists(BTC_CSV_PATH)):
        raise FileNotFoundError(f"請確認 '{NEWS_CSV_PATH}' 和 '{BTC_CSV_PATH}' 檔案存在。")

    initialize_pipeline()
    if SENTIMENT_PIPELINE is None:
        raise RuntimeError("情緒分析模型未能成功初始化。")

    # --- News: load and clean -------------------------------------------
    print(f"⏳ 正在讀取原始資料: '{NEWS_CSV_PATH}'...")
    news_df = pd.read_csv(NEWS_CSV_PATH)
    news_df.dropna(subset=['title', 'text', 'sentiment'], inplace=True)
    news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
    news_df.dropna(subset=['date'], inplace=True)

    print("⏳ 正在合併新聞標題與內文...")
    news_df['full_text'] = news_df['title'] + ". " + news_df['text']

    # --- News: model sentiment ------------------------------------------
    print("⏳ 正在對新聞完整內容 (標題+內文) 進行模型情緒分析...")
    sentiments_model = SENTIMENT_PIPELINE(
        news_df['full_text'].tolist(),
        batch_size=256,
        truncation=True,
        max_length=512,
    )
    score_map_model = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}
    news_df['model_sentiment_score'] = [
        score_map_model.get(result['label'], 0) for result in sentiments_model
    ]

    # --- News: sentiment stored in the dataset --------------------------
    print("⏳ 正在解析預存的情緒欄位 (class, polarity, subjectivity)...")
    # Anything that does not parse to a dict is treated as "no information"
    # so the .get() calls below cannot fail on a malformed cell.
    sentiment_dicts = news_df['sentiment'].apply(safe_literal_eval).apply(
        lambda parsed: parsed if isinstance(parsed, dict) else {}
    )
    class_score_map = {'positive': 1, 'neutral': 0, 'negative': -1}
    news_df['class_sentiment_score'] = sentiment_dicts.apply(
        lambda d: class_score_map.get(d.get('class', 'neutral'), 0)
    )
    news_df['polarity'] = sentiment_dicts.apply(lambda d: d.get('polarity', 0.0))
    news_df['subjectivity'] = sentiment_dicts.apply(lambda d: d.get('subjectivity', 0.0))

    # --- Daily aggregation ----------------------------------------------
    print("⏳ 正在計算每日平均情緒指標...")
    daily_metrics = news_df.groupby('date').agg(
        avg_model_sentiment=('model_sentiment_score', 'mean'),
        avg_class_sentiment=('class_sentiment_score', 'mean'),
        avg_polarity=('polarity', 'mean'),
        avg_subjectivity=('subjectivity', 'mean'),
    ).reset_index()

    # --- BTC prices -----------------------------------------------------
    print(f"⏳ 正在讀取比特幣價格資料: '{BTC_CSV_PATH}'...")
    btc_df = pd.read_csv(BTC_CSV_PATH)
    btc_df['date'] = pd.to_datetime(btc_df['date'], errors='coerce').dt.date
    # pct_change() compares each row with the one before it, so rows must be
    # in chronological order and rows without a usable date must not sit
    # between two real days.
    btc_df = (
        btc_df.dropna(subset=['date'])
        .sort_values('date', kind='stable')
        .reset_index(drop=True)
    )
    btc_df['price_change_pct'] = btc_df['close'].pct_change() * 100

    # --- Merge ----------------------------------------------------------
    print("⏳ 正在合併所有資料...")
    daily_metrics['date'] = pd.to_datetime(daily_metrics['date'])
    btc_df['date'] = pd.to_datetime(btc_df['date'])
    merged_df = pd.merge(btc_df, daily_metrics, on='date', how='inner')

    # Keep the raw headlines/bodies per day for the news browser tab.
    news_content_df = news_df.groupby('date').agg(
        titles=('title', list),
        texts=('text', list),
    ).reset_index()
    news_content_df['date'] = pd.to_datetime(news_content_df['date'])
    final_df = pd.merge(merged_df, news_content_df, on='date', how='left')

    print(f"✅ 資料預處理完成!正在將結果儲存至 '{PROCESSED_DATA_PATH}'...")
    final_df.to_csv(PROCESSED_DATA_PATH, index=False)
    return final_df
def load_data():
    """Return the processed dataset, building it when no cache exists.

    If ``PROCESSED_DATA_PATH`` exists it is read back, ``date`` is parsed to
    datetimes, and the ``titles``/``texts`` columns (stored as list literals
    in the CSV) are turned back into Python lists. Otherwise
    ``preprocess_and_cache_data()`` is run and its result returned.
    """
    if not os.path.exists(PROCESSED_DATA_PATH):
        print("⚠️ 未發現已處理的資料,將執行首次預處理...")
        return preprocess_and_cache_data()

    print(f"✅ 發現已處理的資料快取,正在從 '{PROCESSED_DATA_PATH}' 載入...")
    cached = pd.read_csv(PROCESSED_DATA_PATH)
    cached['date'] = pd.to_datetime(cached['date'])
    # CSV stores the per-day lists as their repr; parse them back.
    for list_column in ('titles', 'texts'):
        cached[list_column] = cached[list_column].apply(ast.literal_eval)
    return cached
# Load the dataset once at import time, order it chronologically and index
# it by date so the rest of the module can slice and look up by date.
df = load_data()
df = df.sort_values(by='date').set_index('date')
def get_filtered_df(start_date, end_date):
    """Return a copy of the rows of ``df`` dated within [start_date, end_date].

    Both bounds are inclusive and are converted with ``pd.to_datetime``.
    An empty DataFrame is returned when either bound is ``None``.
    """
    if any(bound is None for bound in (start_date, end_date)):
        return pd.DataFrame()

    lower = pd.to_datetime(start_date)
    upper = pd.to_datetime(end_date)
    in_range = (df.index >= lower) & (df.index <= upper)
    return df[in_range].copy()
def plot_price_and_sentiment(filtered_df, sentiment_col, sentiment_name, color):
    """Plot BTC close price and one sentiment series on a dual-axis chart.

    Args:
        filtered_df: Frame whose index is used as the x axis and which
            contains ``close`` and ``sentiment_col``.
        sentiment_col: Column holding the sentiment series.
        sentiment_name: Legend label for the sentiment series.
        color: Line/axis colour for the sentiment series.

    Returns:
        A Plotly figure with price on the left axis and sentiment (fixed to
        the range -1..1, dashed line) on the right axis.
    """
    dates = filtered_df.index

    price_trace = go.Scatter(
        x=dates,
        y=filtered_df['close'],
        name='BTC 收盤價',
        line=dict(color='deepskyblue'),
        yaxis='y1',
    )
    sentiment_trace = go.Scatter(
        x=dates,
        y=filtered_df[sentiment_col],
        name=sentiment_name,
        line=dict(color=color, dash='dash'),
        yaxis='y2',
    )

    layout = dict(
        xaxis_title='日期',
        yaxis=dict(title='價格 (USD)', color='deepskyblue'),
        # Second y axis drawn over the first, on the right-hand side.
        yaxis2=dict(title='情緒分數', overlaying='y', side='right', color=color, range=[-1, 1]),
        legend=dict(x=0.01, y=0.99, orientation='h'),
        template=PLOTLY_TEMPLATE,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.2)',
    )

    fig = go.Figure()
    fig.add_trace(price_trace)
    fig.add_trace(sentiment_trace)
    fig.update_layout(**layout)
    return fig
def plot_subjectivity_trend(filtered_df):
    """Plot the daily average news subjectivity as a single line.

    Args:
        filtered_df: Frame whose index is used as the x axis and which
            contains ``avg_subjectivity``.

    Returns:
        A Plotly figure with the y axis fixed to the range 0..1.
    """
    subjectivity_trace = go.Scatter(
        x=filtered_df.index,
        y=filtered_df['avg_subjectivity'],
        name='每日新聞主觀性',
        line=dict(color='lightgreen'),
    )
    layout = dict(
        xaxis_title='日期',
        yaxis=dict(title='主觀性分數 (0=客觀, 1=主觀)', color='lightgreen', range=[0, 1]),
        template=PLOTLY_TEMPLATE,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.2)',
    )

    fig = go.Figure()
    fig.add_trace(subjectivity_trace)
    fig.update_layout(**layout)
    return fig
def plot_correlation(filtered_df, sentiment_col, lag_days):
    """Scatter sentiment against the price change ``lag_days`` later.

    Args:
        filtered_df: Frame containing ``sentiment_col`` and
            ``price_change_pct``.
        sentiment_col: Column holding the daily sentiment metric.
        lag_days: Number of rows to look ahead for the price change; an
            integer-valued float (as a UI slider may send) is accepted.

    Returns:
        tuple: ``(figure, correlation, p_value)``. ``correlation`` and
        ``p_value`` are the Pearson statistics, or ``(0, 1)`` when they
        cannot be computed (fewer than two paired rows, or a constant
        series for which Pearson's r is undefined).
    """
    # shift() needs an integer period.
    lag_days = int(lag_days)

    df_corr = filtered_df[[sentiment_col, 'price_change_pct']].copy()
    # Negative shift aligns each day's sentiment with the change lag_days
    # rows further down.
    df_corr['price_change_pct_lagged'] = df_corr['price_change_pct'].shift(-lag_days)
    df_corr.dropna(inplace=True)

    correlation, p_value = 0, 1
    if len(df_corr) >= 2:
        r_value, p = pearsonr(df_corr[sentiment_col], df_corr['price_change_pct_lagged'])
        # pearsonr yields NaN for constant input; keep the neutral fallback
        # instead of showing "nan" in the UI.
        if not (pd.isna(r_value) or pd.isna(p)):
            correlation, p_value = r_value, p

    fig = go.Figure(
        data=go.Scatter(
            x=df_corr[sentiment_col],
            y=df_corr['price_change_pct_lagged'],
            mode='markers',
            marker=dict(color='mediumpurple', opacity=0.7),
        )
    )
    fig.update_layout(
        title=f'🔗 情緒與 {lag_days} 天後價格變化的關聯性 (相關係數: {correlation:.3f})',
        xaxis_title='每日平均情緒分數',
        yaxis_title=f'{lag_days} 天後價格變化 (%)',
        template=PLOTLY_TEMPLATE,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.2)',
    )
    return fig, correlation, p_value
def get_top_bottom_news(date_obj):
    """Return HTML lists of the most positive and most negative headlines.

    The day's ``titles``/``texts`` are re-scored with ``SENTIMENT_PIPELINE``;
    each item gets ``direction * confidence`` where direction is 1/0/-1 for
    ``LABEL_2/LABEL_1/LABEL_0``. Up to three strictly positive and three
    strictly negative headlines are returned, strongest first.

    Args:
        date_obj: Anything ``pd.to_datetime`` accepts. Time-of-day and
            timezone are discarded, because ``df`` is indexed by
            midnight-normalised naive dates while the date picker supplies a
            full datetime.

    Returns:
        tuple[str, str]: ``(top_html, bottom_html)``, each a ``<ul>`` block.
        Headline text is HTML-escaped. A single-item message list is
        returned when the date is unknown, the model is unavailable, the
        stored data is malformed, or there is no news.
    """
    # Local import keeps this block self-contained.
    from html import escape

    no_data = "<ul><li>無此日期資料</li></ul>"

    date_ts = pd.to_datetime(date_obj)
    if pd.isna(date_ts):
        return no_data, no_data
    if date_ts.tzinfo is not None:
        # Keep the wall-clock date the user picked; just drop the zone.
        date_ts = date_ts.tz_localize(None)
    date_ts = date_ts.normalize()

    if date_ts not in df.index:
        return no_data, no_data

    day_data = df.loc[date_ts]
    if isinstance(day_data, pd.DataFrame):
        # Duplicate index entries for the day: use the first row.
        day_data = day_data.iloc[0]
    titles, texts = day_data.get('titles', []), day_data.get('texts', [])

    initialize_pipeline()
    data_ok = (
        isinstance(titles, list)
        and isinstance(texts, list)
        and len(titles) == len(texts)
    )
    if SENTIMENT_PIPELINE is None or not data_ok:
        bad_input = "<ul><li>模型未載入或新聞資料格式錯誤</li></ul>"
        return bad_input, bad_input

    full_texts_for_day = [f"{title}. {text}" for title, text in zip(titles, texts)]
    if not full_texts_for_day:
        no_news = "<ul><li>當日無新聞</li></ul>"
        return no_news, no_news

    sentiments = SENTIMENT_PIPELINE(full_texts_for_day, batch_size=8, truncation=True, max_length=512)
    score_map = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}
    scored_titles = [
        (title, score_map.get(sentiment['label'], 0) * sentiment['score'])
        for title, sentiment in zip(titles, sentiments)
    ]

    # Most positive first / most negative first; neutral (0) items are excluded.
    positive_news = sorted(
        (item for item in scored_titles if item[1] > 0),
        key=lambda item: item[1],
        reverse=True,
    )
    negative_news = sorted(
        (item for item in scored_titles if item[1] < 0),
        key=lambda item: item[1],
    )

    def _as_items(ranked, fallback):
        """Render the top three titles as <li> items, or a fallback item."""
        if not ranked:
            return fallback
        # Titles come from the dataset and are rendered as raw HTML: escape.
        return "".join(f"<li>{escape(str(title))}</li>" for title, _ in ranked[:3])

    top_news_html = _as_items(positive_news, "<li>當日無正面情緒新聞</li>")
    bottom_news_html = _as_items(negative_news, "<li>當日無負面情緒新聞</li>")
    return f"<ul>{top_news_html}</ul>", f"<ul>{bottom_news_html}</ul>"
# Dashboard UI: layout, callbacks and event wiring all live inside the
# Blocks context; the server is started once the context is closed.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="sky",
        secondary_hue="orange",
        font=["Arial", "sans-serif"],
    ),
    # Reload the page with ?__theme=dark unless it is already set.
    js="""
    function refresh() {
        const url = new URL(window.location);
        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """
) as app:
    # ---- Header ---------------------------------------------------------
    gr.Markdown(
        "<div style='text-align: center; padding: 20px; color: white;'>"
        "<h1 style='font-size: 3em; color: #00BFFF;'>📈 Crypto Pulse</h1>"
        "<p style='font-size: 1.2em; color: #A9A9A9;'>比特幣新聞情緒與價格分析儀表板</p>"
        f"<p style='font-size: 0.9em; color: #888;'>Designed by: {DEVELOPER_NAME}</p>"
        "</div>"
    )

    # ---- Default date range: the last 360 rows, or everything if fewer ---
    max_date_dt = df.index.max()
    min_date_dt = df.index[-360] if len(df) > 360 else df.index.min()

    with gr.Row():
        start_date_input = gr.DateTime(label="📅 開始日期", type="datetime", value=min_date_dt)
        end_date_input = gr.DateTime(label="📅 結束日期", type="datetime", value=max_date_dt)

    with gr.Tabs() as tabs:
        # Tab 0: model sentiment vs. price.
        with gr.TabItem("📊 模型情緒總覽", id=0):
            plot_overview = gr.Plot(label="模型情緒 vs. 價格趨勢圖")
            gr.Markdown("此圖展示了由 `twitter-roberta-base-sentiment` 模型分析出的**新聞內容(標題+內文)**情緒分數(右軸)與比特幣價格(左軸)的對比。")

        # Tab 1: metrics taken from the dataset's own `sentiment` column.
        with gr.TabItem("🔬 多維度情緒分析", id=1):
            gr.Markdown("""
            ### 指標說明
            此處的情緒指標來自資料集 `cryptonews.csv` 中預先計算好的 `sentiment` 欄位。
            * **資料集預設情緒分類**: 將資料集內建的 `positive`, `neutral`, `negative` 類別轉換為 `1, 0, -1` 的數值分數。
            * **情感極性 (Polarity)**: 衡量文本的正面或負面程度。值域從 -1 (非常負面) 到 +1 (非常正面)。
            * **主觀性 (Subjectivity)**: 衡量文本是偏向客觀事實還是主觀意見。值域從 0 (非常客觀) 到 1 (非常主觀)。
            """)
            plot_class_sentiment = gr.Plot(label="資料集預設情緒 vs. 價格趨勢圖")
            plot_polarity = gr.Plot(label="情感極性 vs. 價格趨勢圖")
            plot_subjectivity = gr.Plot(label="新聞主觀性趨勢圖")

        # Tab 2: lagged correlation explorer.
        with gr.TabItem("🔍 關聯性深掘", id=2):
            with gr.Row():
                with gr.Column(scale=2, min_width=250):
                    sentiment_type_radio = gr.Radio(
                        ["模型情緒分數", "資料集預設情緒分類", "情感極性 (Polarity)"],
                        label="選擇分析的情緒指標",
                        value="模型情緒分數",
                    )
                    lag_slider = gr.Slider(minimum=0, maximum=14, value=1, step=1, label="🕒 情緒延遲天數 (Lag Days)")
                    correlation_output = gr.Textbox(label="Pearson 相關係數", interactive=False)
                    p_value_output = gr.Textbox(label="P-Value", interactive=False)
                with gr.Column(scale=3):
                    plot_corr = gr.Plot(label="情緒 vs. 價格變化 散點圖")

        # Tab 3: per-day headline browser.
        with gr.TabItem("📰 新聞瀏覽器", id=3):
            gr.Markdown("在此處選擇特定日期,即可查看當天的熱點新聞。")
            news_date_input = gr.DateTime(label="🗓️ 選擇查詢日期", type="datetime", value=max_date_dt)
            with gr.Row():
                gr.Markdown("### 👍 當日最正面新聞 Top 3")
                gr.Markdown("### 👎 當日最負面新聞 Top 3")
            with gr.Row():
                top_news_output = gr.HTML()
                bottom_news_output = gr.HTML()

    # ---- Callbacks ------------------------------------------------------
    def update_all(start_date, end_date, lag_days, sentiment_type):
        """Recompute all five plots and both correlation text boxes.

        Returns a 7-tuple in the order of ``outputs_for_main_update``. When
        the range is invalid or contains no rows, a warning is raised in the
        UI and empty figures plus "N/A" are returned.
        """

        def _empty_outputs(message):
            # Warn the user, then hand back one blank figure for every plot.
            gr.Warning(message)
            blank = go.Figure()
            return blank, blank, blank, blank, blank, "N/A", "N/A"

        if start_date is None or end_date is None or start_date > end_date:
            return _empty_outputs("請選擇有效的開始與結束日期。")

        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        filtered_df = get_filtered_df(start_date, end_date)
        if filtered_df.empty:
            return _empty_outputs("此日期範圍內無資料,請擴大範圍。")

        overview_fig = plot_price_and_sentiment(filtered_df, 'avg_model_sentiment', '模型情緒分數', 'crimson')
        class_sentiment_fig = plot_price_and_sentiment(filtered_df, 'avg_class_sentiment', '資料集預設情緒分類', 'yellow')
        polarity_fig = plot_price_and_sentiment(filtered_df, 'avg_polarity', '情感極性 (Polarity)', 'orange')
        subjectivity_fig = plot_subjectivity_trend(filtered_df)

        # Radio label -> column; any other choice falls back to polarity.
        column_by_choice = {
            "模型情緒分數": 'avg_model_sentiment',
            "資料集預設情緒分類": 'avg_class_sentiment',
        }
        sentiment_col = column_by_choice.get(sentiment_type, 'avg_polarity')

        corr_fig, corr_val, p_val = plot_correlation(filtered_df, sentiment_col, lag_days)
        return (
            overview_fig,
            class_sentiment_fig,
            polarity_fig,
            subjectivity_fig,
            corr_fig,
            f"{corr_val:.4f}",
            f"{p_val:.4f}",
        )

    def update_news_browser(date_obj):
        """Return (top_html, bottom_html) for the selected day."""
        if date_obj is None:
            return "請選擇日期", "無"
        return get_top_bottom_news(date_obj)

    # ---- Event wiring ---------------------------------------------------
    inputs_for_main_update = [start_date_input, end_date_input, lag_slider, sentiment_type_radio]
    outputs_for_main_update = [
        plot_overview,
        plot_class_sentiment,
        plot_polarity,
        plot_subjectivity,
        plot_corr,
        correlation_output,
        p_value_output,
    ]

    # Any change to a main input refreshes every main output.
    for component in inputs_for_main_update:
        component.change(fn=update_all, inputs=inputs_for_main_update, outputs=outputs_for_main_update)

    news_date_input.change(
        fn=update_news_browser,
        inputs=[news_date_input],
        outputs=[top_news_output, bottom_news_output],
    )

    def load_app():
        """Populate every output with the default selections on page load."""
        return (
            update_all(min_date_dt, max_date_dt, 1, "模型情緒分數")
            + update_news_browser(max_date_dt)
        )

    app.load(
        fn=load_app,
        inputs=None,
        outputs=outputs_for_main_update + [top_news_output, bottom_news_output],
    )

app.launch(debug=False, share=True, show_error=True, show_api=False)