Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

f776692

verified ·

1 Parent(s): f62d086

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +60 -14

src/streamlit_app.py CHANGED Viewed

@@ -111,7 +111,12 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     df.drop_duplicates(inplace=True)
     return df
-def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
@@ -127,18 +132,12 @@ Dataset:
 """
     try:
         cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
-    except Exception as e:
-        st.warning(f"⚠️ AI cleaning failed: {e}")
-        return fallback_clean(df)
-    cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
-    try:
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
-        return cleaned_df
     except Exception as e:
-        st.warning(f"⚠️ AI CSV parse failed: {e}")
-        return fallback_clean(df)
 # ======================================================
 # 🧩 DATA ANALYSIS
@@ -186,14 +185,61 @@ if uploaded:
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
     with st.spinner("🧼 AI Cleaning your dataset..."):
-        cleaned_df = ai_clean_dataset(df)
-    st.subheader("✅ Cleaned Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
-    with st.expander("📋 Cleaning Summary", expanded=False):
-        st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query:

     df.drop_duplicates(inplace=True)
     return df
+def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
+    """Return cleaned dataset and a message if cleaning failed."""
+    max_allowed_rows = 2000
+    if len(df) > max_allowed_rows:
+        return df, f"⚠️ Dataset too large for AI cleaning (>{max_allowed_rows} rows). Using original dataset."
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
 """
     try:
         cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
+        cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
+        return cleaned_df, ""
     except Exception as e:
+        return df, f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
 # ======================================================
 # 🧩 DATA ANALYSIS
     df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
     with st.spinner("🧼 AI Cleaning your dataset..."):
+        cleaned_df, cleaning_msg = ai_clean_dataset(df)
+    if cleaning_msg:
+        st.warning(cleaning_msg)
+        st.info("💡 Note: For AI cleaning to work best, datasets should ideally be under 2000 rows.")
+    st.subheader("✅ Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
+    # ================== Quick Visualizations ==================
+    with st.expander("📈 Quick Visualizations", expanded=True):
+        numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
+        categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
+        viz_type = st.selectbox(
+            "Visualization Type",
+            ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
+        )
+        if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
+            x = st.selectbox("X-axis", numeric_cols)
+            y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
+            color = st.selectbox("Color", ["None"] + categorical_cols)
+            fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Histogram" and numeric_cols:
+            col = st.selectbox("Column", numeric_cols)
+            fig = px.histogram(cleaned_df, x=col, nbins=30)
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Box Plot" and numeric_cols:
+            col = st.selectbox("Column", numeric_cols)
+            fig = px.box(cleaned_df, y=col)
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
+            corr = cleaned_df[numeric_cols].corr()
+            fig = ff.create_annotated_heatmap(
+                z=corr.values,
+                x=list(corr.columns),
+                y=list(corr.index),
+                annotation_text=corr.round(2).values,
+                showscale=True
+            )
+            st.plotly_chart(fig, use_container_width=True)
+        elif viz_type == "Categorical Count" and categorical_cols:
+            cat = st.selectbox("Category", categorical_cols)
+            fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
+            st.plotly_chart(fig, use_container_width=True)
+        else:
+            st.warning("⚠️ Not enough columns for this visualization type.")
+    # ================== AI Analysis ==================
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query: