Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

c627e4b

verified ·

1 Parent(s): 081976e

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +36 -31

src/streamlit_app.py CHANGED Viewed

@@ -13,8 +13,8 @@ from io import StringIO
 # ⚙️ APP CONFIGURATION
 # ======================================================
 st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
-st.title("📊 Smart Data Analyst Pro")
-st.caption("AI that cleans, analyzes, and visualizes your data — Hugging Face + Gemini compatible.")
 # ======================================================
 # 🔐 Load Environment Variables
@@ -29,7 +29,7 @@ else:
     login(token=HF_TOKEN)
 if GEMINI_API_KEY:
-    genai.api_key = GEMINI_API_KEY
 else:
     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
@@ -51,6 +51,7 @@ with st.sidebar:
     ANALYST_MODEL = st.selectbox(
         "Select Analysis Model:",
         [
             "Qwen/Qwen2.5-14B-Instruct",
             "mistralai/Mistral-7B-Instruct-v0.3",
             "HuggingFaceH4/zephyr-7b-beta"
@@ -61,7 +62,6 @@ with st.sidebar:
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
     max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
-# Initialize HF clients
 hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
 hf_analyst_client = None
 if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
@@ -80,7 +80,7 @@ def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
         )
         return resp.strip()
     except Exception as e:
-        if "Supported task: conversational" in str(e) or "not supported" in str(e):
             chat_resp = client.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=max_tokens,
@@ -91,7 +91,7 @@ def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
             raise e
 # ======================================================
-# 🧩 SMART DATA CLEANING
 # ======================================================
 def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     df = df.copy()
@@ -109,7 +109,6 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
-    """Returns cleaned df and a status message"""
     if len(df) > 50:
         return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
     csv_text = df.to_csv(index=False)
@@ -135,7 +134,7 @@ Dataset:
         return df, f"⚠️ AI cleaning failed: {str(e)}"
 # ======================================================
-# 🧩 DATA SUMMARY FOR TOKEN-EFFICIENT ANALYSIS
 # ======================================================
 def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
     summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
@@ -147,14 +146,13 @@ def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
         else:
             top = df[col].value_counts().head(3).to_dict()
             summary.append(f"- {col}: top_values={top}, non_null={non_null}")
-    # Include a small sample for context
     sample = df.head(sample_rows).to_csv(index=False)
     summary.append("--- Sample Data ---")
     summary.append(sample)
     return "\n".join(summary)
 # ======================================================
-# 🧩 DATA ANALYSIS
 # ======================================================
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
     prompt_summary = summarize_for_analysis(df)
@@ -176,44 +174,51 @@ Respond with:
 """
     try:
         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
-            if GEMINI_API_KEY is None:
-                return "⚠️ Gemini API key missing."
-            response = genai.models.generate(
-                model="gemini-2.5-flash",
-                messages=[{"author": "user", "content": prompt}],
-                temperature=temperature,
-                max_output_tokens=max_tokens
             )
-            return response.candidates[0].content if response.candidates else "No response from Gemini."
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
         return f"⚠️ Analysis failed: {str(e)}"
 # ======================================================
-# 🚀 MAIN APP LOGIC
 # ======================================================
 uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
 if uploaded:
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
-    with st.spinner("🧼 AI Cleaning your dataset..."):
         cleaned_df, cleaning_status = ai_clean_dataset(df)
-    st.subheader("✅ Data Cleaning Status")
     st.info(cleaning_status)
     st.subheader("📊 Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
-    st.subheader("💬 Ask AI About Your Data")
-    user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
-    if st.button("Analyze with AI", use_container_width=True) and user_query:
-        with st.spinner("🤖 Interpreting data..."):
-            # Analyst can work with original or cleaned dataset
-            result = query_analysis_model(cleaned_df, user_query, uploaded.name)
-        st.markdown("### 💡 Insights")
-        st.markdown(result)
 else:
-    st.info("📥 Upload a dataset to begin smart analysis.")

 # ⚙️ APP CONFIGURATION
 # ======================================================
 st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
+st.title("📊 Smart Data Analyst Pro (Chat Mode)")
+st.caption("Chat with your dataset — AI cleans, analyzes, and visualizes data. Hugging Face + Gemini compatible.")
 # ======================================================
 # 🔐 Load Environment Variables
     login(token=HF_TOKEN)
 if GEMINI_API_KEY:
+    genai.configure(api_key=GEMINI_API_KEY)
 else:
     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
     ANALYST_MODEL = st.selectbox(
         "Select Analysis Model:",
         [
+            "Gemini 2.5 Flash (Google)",
             "Qwen/Qwen2.5-14B-Instruct",
             "mistralai/Mistral-7B-Instruct-v0.3",
             "HuggingFaceH4/zephyr-7b-beta"
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
     max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
 hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
 hf_analyst_client = None
 if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
         )
         return resp.strip()
     except Exception as e:
+        if "Supported task: conversational" in str(e):
             chat_resp = client.chat_completion(
                 messages=[{"role": "user", "content": prompt}],
                 max_tokens=max_tokens,
             raise e
 # ======================================================
+# 🧩 DATA CLEANING
 # ======================================================
 def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     df = df.copy()
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
     if len(df) > 50:
         return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
     csv_text = df.to_csv(index=False)
         return df, f"⚠️ AI cleaning failed: {str(e)}"
 # ======================================================
+# 🧩 DATA SUMMARY (Token-efficient)
 # ======================================================
 def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
     summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
         else:
             top = df[col].value_counts().head(3).to_dict()
             summary.append(f"- {col}: top_values={top}, non_null={non_null}")
     sample = df.head(sample_rows).to_csv(index=False)
     summary.append("--- Sample Data ---")
     summary.append(sample)
     return "\n".join(summary)
 # ======================================================
+# 🧠 ANALYSIS FUNCTION
 # ======================================================
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
     prompt_summary = summarize_for_analysis(df)
 """
     try:
         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
+            response = genai.GenerativeModel("gemini-2.5-flash").generate_content(
+                prompt,
+                generation_config={
+                    "temperature": temperature,
+                    "max_output_tokens": max_tokens
+                }
             )
+            return response.text if hasattr(response, "text") else "No valid text response."
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
         return f"⚠️ Analysis failed: {str(e)}"
 # ======================================================
+# 🚀 MAIN CHATBOT LOGIC
 # ======================================================
 uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])
+if "messages" not in st.session_state:
+    st.session_state.messages = []
 if uploaded:
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
+    with st.spinner("🧼 Cleaning your dataset..."):
         cleaned_df, cleaning_status = ai_clean_dataset(df)
+    st.subheader("✅ Cleaning Status")
     st.info(cleaning_status)
     st.subheader("📊 Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
+    st.subheader("💬 Chat with Your Dataset")
+    for msg in st.session_state.messages:
+        with st.chat_message(msg["role"]):
+            st.markdown(msg["content"])
+    if user_query := st.chat_input("Ask something about your dataset..."):
+        st.session_state.messages.append({"role": "user", "content": user_query})
+        with st.chat_message("user"):
+            st.markdown(user_query)
+        with st.chat_message("assistant"):
+            with st.spinner("🤖 Analyzing..."):
+                result = query_analysis_model(cleaned_df, user_query, uploaded.name)
+                st.markdown(result)
+                st.session_state.messages.append({"role": "assistant", "content": result})
 else:
+    st.info("📥 Upload a dataset to begin chatting with your AI analyst.")