Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

6ccbf34

verified ·

1 Parent(s): f776692

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +14 -51

src/streamlit_app.py CHANGED Viewed

@@ -112,10 +112,15 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
-    """Return cleaned dataset and a message if cleaning failed."""
-    max_allowed_rows = 2000
-    if len(df) > max_allowed_rows:
-        return df, f"⚠️ Dataset too large for AI cleaning (>{max_allowed_rows} rows). Using original dataset."
     csv_text = df.to_csv(index=False)
     prompt = f"""
@@ -137,7 +142,8 @@ Dataset:
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
         return cleaned_df, ""
     except Exception as e:
-        return df, f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
 # ======================================================
 # 🧩 DATA ANALYSIS
@@ -187,59 +193,16 @@ if uploaded:
     with st.spinner("🧼 AI Cleaning your dataset..."):
         cleaned_df, cleaning_msg = ai_clean_dataset(df)
     if cleaning_msg:
         st.warning(cleaning_msg)
-        st.info("💡 Note: For AI cleaning to work best, datasets should ideally be under 2000 rows.")
     st.subheader("✅ Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
-    # ================== Quick Visualizations ==================
-    with st.expander("📈 Quick Visualizations", expanded=True):
-        numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
-        categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
-        viz_type = st.selectbox(
-            "Visualization Type",
-            ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
-        )
-        if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
-            x = st.selectbox("X-axis", numeric_cols)
-            y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
-            color = st.selectbox("Color", ["None"] + categorical_cols)
-            fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Histogram" and numeric_cols:
-            col = st.selectbox("Column", numeric_cols)
-            fig = px.histogram(cleaned_df, x=col, nbins=30)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Box Plot" and numeric_cols:
-            col = st.selectbox("Column", numeric_cols)
-            fig = px.box(cleaned_df, y=col)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
-            corr = cleaned_df[numeric_cols].corr()
-            fig = ff.create_annotated_heatmap(
-                z=corr.values,
-                x=list(corr.columns),
-                y=list(corr.index),
-                annotation_text=corr.round(2).values,
-                showscale=True
-            )
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Categorical Count" and categorical_cols:
-            cat = st.selectbox("Category", categorical_cols)
-            fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
-            st.plotly_chart(fig, use_container_width=True)
-        else:
-            st.warning("⚠️ Not enough columns for this visualization type.")
-    # ================== AI Analysis ==================
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query:

     return df
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
+    """
+    Attempts AI cleaning. Returns:
+    - DataFrame (cleaned or original)
+    - Message explaining status or fallback reason
+    """
+    # Skip cleaning if dataset too large
+    if len(df) > 50:
+        msg = "⚠️ AI cleaning skipped: dataset has more than 50 rows. Using original dataset for analysis."
+        return df, msg
     csv_text = df.to_csv(index=False)
     prompt = f"""
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
         return cleaned_df, ""
     except Exception as e:
+        msg = f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
+        return df, msg
 # ======================================================
 # 🧩 DATA ANALYSIS
     with st.spinner("🧼 AI Cleaning your dataset..."):
         cleaned_df, cleaning_msg = ai_clean_dataset(df)
+    # Show warning if cleaning skipped or failed
     if cleaning_msg:
         st.warning(cleaning_msg)
     st.subheader("✅ Dataset Preview")
     st.dataframe(cleaned_df.head(), use_container_width=True)
+    with st.expander("📋 Dataset Summary", expanded=False):
+        st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
     if st.button("Analyze with AI", use_container_width=True) and user_query: