Starberry15 committed on
Commit
6ccbf34
·
verified ·
1 Parent(s): f776692

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +14 -51
src/streamlit_app.py CHANGED
@@ -112,10 +112,15 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
112
  return df
113
 
114
  def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
115
- """Return cleaned dataset and a message if cleaning failed."""
116
- max_allowed_rows = 2000
117
- if len(df) > max_allowed_rows:
118
- return df, f"⚠️ Dataset too large for AI cleaning (>{max_allowed_rows} rows). Using original dataset."
 
 
 
 
 
119
 
120
  csv_text = df.to_csv(index=False)
121
  prompt = f"""
@@ -137,7 +142,8 @@ Dataset:
137
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
138
  return cleaned_df, ""
139
  except Exception as e:
140
- return df, f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
 
141
 
142
  # ======================================================
143
  # 🧩 DATA ANALYSIS
@@ -187,59 +193,16 @@ if uploaded:
187
  with st.spinner("🧼 AI Cleaning your dataset..."):
188
  cleaned_df, cleaning_msg = ai_clean_dataset(df)
189
 
 
190
  if cleaning_msg:
191
  st.warning(cleaning_msg)
192
- st.info("💡 Note: For AI cleaning to work best, datasets should ideally be under 2000 rows.")
193
 
194
  st.subheader("✅ Dataset Preview")
195
  st.dataframe(cleaned_df.head(), use_container_width=True)
196
 
197
- # ================== Quick Visualizations ==================
198
- with st.expander("📈 Quick Visualizations", expanded=True):
199
- numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
200
- categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
201
 
202
- viz_type = st.selectbox(
203
- "Visualization Type",
204
- ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
205
- )
206
-
207
- if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
208
- x = st.selectbox("X-axis", numeric_cols)
209
- y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
210
- color = st.selectbox("Color", ["None"] + categorical_cols)
211
- fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
212
- st.plotly_chart(fig, use_container_width=True)
213
-
214
- elif viz_type == "Histogram" and numeric_cols:
215
- col = st.selectbox("Column", numeric_cols)
216
- fig = px.histogram(cleaned_df, x=col, nbins=30)
217
- st.plotly_chart(fig, use_container_width=True)
218
-
219
- elif viz_type == "Box Plot" and numeric_cols:
220
- col = st.selectbox("Column", numeric_cols)
221
- fig = px.box(cleaned_df, y=col)
222
- st.plotly_chart(fig, use_container_width=True)
223
-
224
- elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
225
- corr = cleaned_df[numeric_cols].corr()
226
- fig = ff.create_annotated_heatmap(
227
- z=corr.values,
228
- x=list(corr.columns),
229
- y=list(corr.index),
230
- annotation_text=corr.round(2).values,
231
- showscale=True
232
- )
233
- st.plotly_chart(fig, use_container_width=True)
234
-
235
- elif viz_type == "Categorical Count" and categorical_cols:
236
- cat = st.selectbox("Category", categorical_cols)
237
- fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
238
- st.plotly_chart(fig, use_container_width=True)
239
- else:
240
- st.warning("⚠️ Not enough columns for this visualization type.")
241
-
242
- # ================== AI Analysis ==================
243
  st.subheader("💬 Ask AI About Your Data")
244
  user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
245
  if st.button("Analyze with AI", use_container_width=True) and user_query:
 
112
  return df
113
 
114
  def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
115
+ """
116
+ Attempts AI cleaning. Returns:
117
+ - DataFrame (cleaned or original)
118
+ - Message explaining status or fallback reason
119
+ """
120
+ # Skip cleaning if dataset too large
121
+ if len(df) > 50:
122
+ msg = "⚠️ AI cleaning skipped: dataset has more than 50 rows. Using original dataset for analysis."
123
+ return df, msg
124
 
125
  csv_text = df.to_csv(index=False)
126
  prompt = f"""
 
142
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
143
  return cleaned_df, ""
144
  except Exception as e:
145
+ msg = f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
146
+ return df, msg
147
 
148
  # ======================================================
149
  # 🧩 DATA ANALYSIS
 
193
  with st.spinner("🧼 AI Cleaning your dataset..."):
194
  cleaned_df, cleaning_msg = ai_clean_dataset(df)
195
 
196
+ # Show warning if cleaning skipped or failed
197
  if cleaning_msg:
198
  st.warning(cleaning_msg)
 
199
 
200
  st.subheader("✅ Dataset Preview")
201
  st.dataframe(cleaned_df.head(), use_container_width=True)
202
 
203
+ with st.expander("📋 Dataset Summary", expanded=False):
204
+ st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
 
 
205
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
  st.subheader("💬 Ask AI About Your Data")
207
  user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
208
  if st.button("Analyze with AI", use_container_width=True) and user_query: