Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Community

Starberry15 committed 27 days ago

Commit

8ee580a

verified ·

1 Parent(s): 1eb934c

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +256 -128

src/streamlit_app.py CHANGED Viewed

@@ -1,140 +1,268 @@
-# streamlit_data_analysis_app.py
-# Streamlit Data Analysis App with LLM-powered cleaning and insights
-# Features:
-# - Upload CSV / Excel
-# - Dataset cleaned automatically by Qwen 2.5 Coder
-# - Preprocessing, visualizations, summaries
-# - Insights via Mistral, Mixtral, Qwen 14B, Gemini
 import os
-import streamlit as st
 import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-from huggingface_hub import InferenceClient
-import google.generativeai as genai
-# ---------- CONFIG ----------
-st.set_page_config(page_title="LLM-Powered Data Analysis", layout="wide")
-# ---------- API KEYS ----------
-try:
-    GEMINI_API_KEY = st.secrets.get("GEMINI_API_KEY")
-except:
-    GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
-HF_API_KEY = st.secrets.get("HF_API_KEY") or os.getenv("HF_API_KEY")
-if GEMINI_API_KEY:
-    genai.configure(api_key=GEMINI_API_KEY)
-hf_client = InferenceClient(token=HF_API_KEY) if HF_API_KEY else None
-# ---------- UTILITIES ----------
-def read_file(uploaded_file):
-    name = uploaded_file.name.lower()
-    if name.endswith(('.csv', '.txt')):
-        return pd.read_csv(uploaded_file)
-    elif name.endswith(('.xls', '.xlsx')):
-        return pd.read_excel(uploaded_file)
-    else:
-        raise ValueError("Unsupported file type. Please upload CSV or Excel.")
-def call_hf_model(prompt: str, model: str):
-    """Call Hugging Face inference API"""
-    if not hf_client:
-        return "⚠️ HF API key not found."
     try:
-        output = hf_client.text_generation(model=model, inputs=prompt, max_new_tokens=1024)
-        return output[0]["generated_text"]
     except Exception as e:
-        return f"❌ HF call failed: {e}"
-def call_gemini(prompt: str, model="gemini-2.0-flash"):
-    if not GEMINI_API_KEY:
-        return "⚠️ Gemini API key not found."
     try:
-        model_obj = genai.GenerativeModel(model)
-        response = model_obj.generate_content(prompt)
-        return response.text
     except Exception as e:
-        return f"❌ Gemini call failed: {e}"
-# ---------- STREAMLIT UI ----------
-st.title("📊 LLM-Powered Data Analysis App")
-st.markdown("Upload a dataset and let AI clean & analyze it automatically!")
-# Sidebar options
-with st.sidebar:
-    st.header("⚙️ Options")
-    cleaner_model = st.selectbox("Dataset Cleaner", ["Qwen-2.5-coder"])
-    analysis_model = st.selectbox("Analysis / Insights Model", ["mistralai/Mistral-7B-Instruct", "mixtral/Mixtral-8B", "Qwen-14B"])
-    use_gemini = st.checkbox("Enable Gemini Insights", value=False)
-uploaded_file = st.file_uploader("📂 Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
-if uploaded_file:
-    # Save file to /tmp for Spaces
-    temp_path = os.path.join("/tmp", uploaded_file.name)
-    with open(temp_path, "wb") as f:
-        f.write(uploaded_file.getbuffer())
-    with open(temp_path, "rb") as f:
-        raw_df = read_file(f)
-    st.subheader("Raw Data Preview")
-    st.dataframe(raw_df.head())
-    # ---------- DATA CLEANING ----------
-    st.subheader("Cleaning dataset with AI...")
-    prompt_clean = f"Clean the following dataset and return a valid CSV. Only return CSV text. Input:\n{raw_df.to_csv(index=False)}"
-    cleaned_csv_text = call_hf_model(prompt_clean, model=cleaner_model)
-    from io import StringIO
-    cleaned_df = pd.read_csv(StringIO(cleaned_csv_text))
-    st.success("✅ Dataset cleaned!")
-    st.dataframe(cleaned_df.head())
-    # ---------- SUMMARY ----------
-    st.subheader("Dataset Summary")
-    st.write(f"Shape: {cleaned_df.shape}")
-    st.dataframe(cleaned_df.describe(include='all'))
-    # ---------- VISUALIZATIONS ----------
-    st.subheader("Visualizations")
-    viz_col = st.selectbox("Select column", options=cleaned_df.columns)
-    viz_type = st.selectbox("Visualization type", ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter', 'Correlation heatmap'])
-    if viz_type == 'Scatter':
-        second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])
-    if st.button("Show Visualization"):
-        fig, ax = plt.subplots(figsize=(8, 5))
-        try:
-            if viz_type == 'Histogram':
-                sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
-            elif viz_type == 'Boxplot':
-                sns.boxplot(x=cleaned_df[viz_col], ax=ax)
-            elif viz_type == 'Bar (categorical)':
-                counts = cleaned_df[viz_col].astype(str).value_counts().head(20)
-                sns.barplot(x=counts.values, y=counts.index, ax=ax)
-            elif viz_type == 'Scatter':
-                sns.scatterplot(x=cleaned_df[viz_col], y=cleaned_df[second_col], ax=ax)
-            elif viz_type == 'Correlation heatmap':
-                corr = cleaned_df.select_dtypes(include=['number']).corr()
-                sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
-            st.pyplot(fig)
-        except Exception as e:
-            st.error(f"Visualization failed: {e}")
-    # ---------- INSIGHTS ----------
-    st.subheader("🧠 AI Insights")
-    user_q = st.text_area("Enter your question (optional):")
-    if st.button("Get AI Insights"):
-        prompt_analysis = f"Dataset:\n{cleaned_df.to_csv(index=False)}\nQuestion: {user_q if user_q else 'Provide a summary and key patterns.'}"
-        if use_gemini:
-            resp = call_gemini(prompt_analysis)
         else:
-            resp = call_hf_model(prompt_analysis, model=analysis_model)
-        st.write(resp)
 else:
-    st.info("📥 Upload a dataset to begin.")

 import os
 import pandas as pd
+import numpy as np
+import streamlit as st
+import plotly.express as px
+import plotly.figure_factory as ff
+from dotenv import load_dotenv
+from huggingface_hub import InferenceClient, login
+from io import StringIO
+# ======================================================
+# ⚙️ APP CONFIGURATION
+# ======================================================
+st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
+st.title("📊 Smart Data Analyst Pro")
+st.caption("AI that cleans, analyzes, and visualizes your data — powered by Hugging Face Inference API.")
+# ======================================================
+# 🔐 Load Environment Variables
+# ======================================================
+load_dotenv()
+HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
+if not HF_TOKEN:
+    st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
+else:
+    login(token=HF_TOKEN)
+# ======================================================
+# 🧠 MODEL SETUP
+# ======================================================
+with st.sidebar:
+    st.header("⚙️ Model Settings")
+    CLEANER_MODEL = st.selectbox(
+        "Select Cleaner Model:",
+        [
+            "Qwen/Qwen2.5-Coder-7B-Instruct",
+            "meta-llama/Meta-Llama-3-8B-Instruct",
+            "microsoft/Phi-3-mini-4k-instruct",
+            "mistralai/Mistral-7B-Instruct-v0.3"
+        ],
+        index=0
+    )
+    ANALYST_MODEL = st.selectbox(
+        "Select Analysis Model:",
+        [
+            "Qwen/Qwen2.5-14B-Instruct",
+            "mistralai/Mistral-7B-Instruct-v0.3",
+            "HuggingFaceH4/zephyr-7b-beta"
+        ],
+        index=0
+    )
+    temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
+    max_tokens = st.slider("Max Tokens", 128, 2048, 512)
+# Initialize inference clients
+cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
+analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
+# ======================================================
+# 🧩 SAFE GENERATION FUNCTION
+# ======================================================
+def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
+    """
+    Tries text_generation first, then falls back to chat_completion if not supported.
+    Returns plain string content.
+    """
     try:
+        resp = client.text_generation(
+            prompt,
+            temperature=temperature,
+            max_new_tokens=max_tokens,
+            return_full_text=False,
+        )
+        return resp.strip()
     except Exception as e:
+        if "Supported task: conversational" in str(e) or "not supported" in str(e):
+            chat_resp = client.chat_completion(
+                messages=[{"role": "user", "content": prompt}],
+                max_tokens=max_tokens,
+                temperature=temperature,
+            )
+            return chat_resp["choices"][0]["message"]["content"].strip()
+        else:
+            raise e
+# ======================================================
+# 🧩 SMART DATA CLEANING
+# ======================================================
+def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
+    """Backup rule-based cleaner."""
+    df = df.copy()
+    df.dropna(axis=1, how="all", inplace=True)
+    df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
+    for col in df.columns:
+        if df[col].dtype == "O":
+            if not df[col].mode().empty:
+                df[col].fillna(df[col].mode()[0], inplace=True)
+            else:
+                df[col].fillna("Unknown", inplace=True)
+        else:
+            df[col].fillna(df[col].median(), inplace=True)
+    df.drop_duplicates(inplace=True)
+    return df
+def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
+    """Cleans the dataset using the selected AI model. Falls back gracefully if the model fails."""
+    raw_preview = df.head(5).to_csv(index=False)
+    prompt = f"""
+You are a professional data cleaning assistant.
+Clean and standardize the dataset below dynamically:
+1. Handle missing values
+2. Fix column name inconsistencies
+3. Convert data types (dates, numbers, categories)
+4. Remove irrelevant or duplicate rows
+Return ONLY a valid CSV text (no markdown, no explanations).
+--- RAW SAMPLE ---
+{raw_preview}
+"""
     try:
+        cleaned_str = safe_hf_generate(cleaner_client, prompt, temperature=0.1, max_tokens=1024)
     except Exception as e:
+        st.warning(f"⚠️ AI cleaning failed: {e}")
+        return fallback_clean(df)
+    cleaned_str = (
+        cleaned_str.replace("```csv", "")
+        .replace("```", "")
+        .replace("###", "")
+        .replace(";", ",")
+        .strip()
+    )
+    lines = cleaned_str.splitlines()
+    lines = [line for line in lines if "," in line and not line.lower().startswith(("note", "summary"))]
+    cleaned_str = "\n".join(lines)
+    try:
+        cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
+        cleaned_df = cleaned_df.dropna(axis=1, how="all")
+        cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
+        return cleaned_df
+    except Exception as e:
+        st.warning(f"⚠️ AI CSV parse failed: {e}")
+        return fallback_clean(df)
+def summarize_dataframe(df: pd.DataFrame) -> str:
+    """Generate a concise summary of the dataframe."""
+    lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
+    for col in df.columns[:10]:
+        non_null = int(df[col].notnull().sum())
+        if pd.api.types.is_numeric_dtype(df[col]):
+            desc = df[col].describe().to_dict()
+            mean = float(desc.get("mean", np.nan))
+            median = float(df[col].median()) if non_null > 0 else None
+            lines.append(f"- {col}: mean={mean:.3f}, median={median}, non_null={non_null}")
+        else:
+            top = df[col].value_counts().head(3).to_dict()
+            lines.append(f"- {col}: top_values={top}, non_null={non_null}")
+    return "\n".join(lines)
+def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
+    """Send the dataframe and user query to the analysis model for interpretation."""
+    df_summary = summarize_dataframe(df)
+    sample = df.head(6).to_csv(index=False)
+    prompt = f"""
+You are a professional data analyst.
+Analyze the dataset '{dataset_name}' and answer the user's question.
+--- SUMMARY ---
+{df_summary}
+--- SAMPLE DATA ---
+{sample}
+--- USER QUESTION ---
+{user_query}
+Respond with:
+1. Key insights and patterns
+2. Quantitative findings
+3. Notable relationships or anomalies
+4. Data-driven recommendations
+"""
+    try:
+        response = safe_hf_generate(analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
+        return response
+    except Exception as e:
+        return f"⚠️ Analysis failed: {e}"
+# ======================================================
+# 🚀 MAIN APP LOGIC
+# ======================================================
+uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
+if uploaded:
+    df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
+    with st.spinner("🧼 AI Cleaning your dataset..."):
+        cleaned_df = ai_clean_dataset(df)
+    st.subheader("✅ Cleaned Dataset Preview")
+    st.dataframe(cleaned_df.head(), use_container_width=True)
+    with st.expander("📋 Cleaning Summary", expanded=False):
+        st.text(summarize_dataframe(cleaned_df))
+    with st.expander("📈 Quick Visualizations", expanded=True):
+        numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
+        categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
+        viz_type = st.selectbox(
+            "Visualization Type",
+            ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
+        )
+        if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
+            x = st.selectbox("X-axis", numeric_cols)
+            y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
+            color = st.selectbox("Color", ["None"] + categorical_cols)
+            fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Histogram" and numeric_cols:
+            col = st.selectbox("Column", numeric_cols)
+            fig = px.histogram(cleaned_df, x=col, nbins=30)
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Box Plot" and numeric_cols:
+            col = st.selectbox("Column", numeric_cols)
+            fig = px.box(cleaned_df, y=col)
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
+            corr = cleaned_df[numeric_cols].corr()
+            fig = ff.create_annotated_heatmap(
+                z=corr.values,
+                x=list(corr.columns),
+                y=list(corr.index),
+                annotation_text=corr.round(2).values,
+                showscale=True
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Categorical Count" and categorical_cols:
+            cat = st.selectbox("Category", categorical_cols)
+            fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
+            st.plotly_chart(fig, use_container_width=True)
         else:
+            st.warning("⚠️ Not enough columns for this visualization type.")
+    st.subheader("💬 Ask AI About Your Data")
+    user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
+    if st.button("Analyze with AI", use_container_width=True) and user_query:
+        with st.spinner("🤖 Interpreting data..."):
+            result = query_analysis_model(cleaned_df, user_query, uploaded.name)
+        st.markdown("### 💡 Insights")
+        st.markdown(result)
 else:
+    st.info("📥 Upload a dataset to begin smart analysis.")