Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Community

Starberry15 committed on Oct 22

Commit

34f26fc

verified ·

1 Parent(s): 6ccbf34

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +32 -25

src/streamlit_app.py CHANGED Viewed

@@ -112,16 +112,9 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
-    """
-    Attempts AI cleaning. Returns:
-    - DataFrame (cleaned or original)
-    - Message explaining status or fallback reason
-    """
-    # Skip cleaning if dataset too large
     if len(df) > 50:
-        msg = "⚠️ AI cleaning skipped: dataset has more than 50 rows. Using original dataset for analysis."
-        return df, msg
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
@@ -140,22 +133,40 @@ Dataset:
         cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
-        return cleaned_df, ""
     except Exception as e:
-        msg = f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
-        return df, msg
 # ======================================================
 # 🧩 DATA ANALYSIS
 # ======================================================
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
-    csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data analyst.
 Analyze the dataset '{dataset_name}' and answer the user's question.
---- FULL DATA ---
-{csv_text}
 --- USER QUESTION ---
 {user_query}
@@ -180,7 +191,7 @@ Respond with:
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
-        return f"⚠️ Analysis failed: {e}"
 # ======================================================
 # 🚀 MAIN APP LOGIC
@@ -191,25 +202,21 @@ if uploaded:
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
     with st.spinner("🧼 AI Cleaning your dataset..."):
-        cleaned_df, cleaning_msg = ai_clean_dataset(df)
-    # Show warning if cleaning skipped or failed
-    if cleaning_msg:
-        st.warning(cleaning_msg)
-    st.subheader("✅ Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
-    with st.expander("📋 Dataset Summary", expanded=False):
-        st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query:
         with st.spinner("🤖 Interpreting data..."):
             result = query_analysis_model(cleaned_df, user_query, uploaded.name)
         st.markdown("### 💡 Insights")
         st.markdown(result)
 else:
     st.info("📥 Upload a dataset to begin smart analysis.")

     return df
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
+    """Returns cleaned df and a status message"""
     if len(df) > 50:
+        return df, "AI cleaning skipped: dataset has more than 50 rows."
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
         cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
+        return cleaned_df, "AI cleaning completed successfully."
     except Exception as e:
+        return df, f"AI cleaning failed: {str(e)}"
+# ======================================================
+# 🧩 DATA SUMMARY FOR TOKEN-EFFICIENT ANALYSIS
+# ======================================================
+def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
+    summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
+    for col in df.columns:
+        non_null = int(df[col].notnull().sum())
+        if pd.api.types.is_numeric_dtype(df[col]):
+            desc = df[col].describe().to_dict()
+            summary.append(f"- {col}: mean={desc.get('mean', np.nan):.2f}, median={df[col].median():.2f}, non_null={non_null}")
+        else:
+            top = df[col].value_counts().head(3).to_dict()
+            summary.append(f"- {col}: top_values={top}, non_null={non_null}")
+    # Include a small sample
+    sample = df.head(sample_rows).to_csv(index=False)
+    summary.append("--- Sample Data ---")
+    summary.append(sample)
+    return "\n".join(summary)
 # ======================================================
 # 🧩 DATA ANALYSIS
 # ======================================================
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
+    prompt_summary = summarize_for_analysis(df)
     prompt = f"""
 You are a professional data analyst.
 Analyze the dataset '{dataset_name}' and answer the user's question.
+--- DATA SUMMARY ---
+{prompt_summary}
 --- USER QUESTION ---
 {user_query}
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
+        return f"⚠️ Analysis failed: {str(e)}"
 # ======================================================
 # 🚀 MAIN APP LOGIC
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
     with st.spinner("🧼 AI Cleaning your dataset..."):
+        cleaned_df, cleaning_status = ai_clean_dataset(df)
+    st.subheader("✅ Data Cleaning Status")
+    st.info(cleaning_status)
+    st.subheader("📊 Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query:
         with st.spinner("🤖 Interpreting data..."):
+            # Analyst can work with original or cleaned dataset
             result = query_analysis_model(cleaned_df, user_query, uploaded.name)
         st.markdown("### 💡 Insights")
         st.markdown(result)
 else:
     st.info("📥 Upload a dataset to begin smart analysis.")