Starberry15 committed on
Commit
f776692
·
verified ·
1 Parent(s): f62d086

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +60 -14
src/streamlit_app.py CHANGED
@@ -111,7 +111,12 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
111
  df.drop_duplicates(inplace=True)
112
  return df
113
 
114
- def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
 
 
 
 
 
115
  csv_text = df.to_csv(index=False)
116
  prompt = f"""
117
  You are a professional data cleaning assistant.
@@ -127,18 +132,12 @@ Dataset:
127
  """
128
  try:
129
  cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
130
- except Exception as e:
131
- st.warning(f"⚠️ AI cleaning failed: {e}")
132
- return fallback_clean(df)
133
-
134
- cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
135
- try:
136
  cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
137
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
138
- return cleaned_df
139
  except Exception as e:
140
- st.warning(f"⚠️ AI CSV parse failed: {e}")
141
- return fallback_clean(df)
142
 
143
  # ======================================================
144
  # 🧩 DATA ANALYSIS
@@ -186,14 +185,61 @@ if uploaded:
186
  df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
187
 
188
  with st.spinner("🧼 AI Cleaning your dataset..."):
189
- cleaned_df = ai_clean_dataset(df)
 
 
 
 
190
 
191
- st.subheader("✅ Cleaned Dataset Preview")
192
  st.dataframe(cleaned_df.head(), use_container_width=True)
193
 
194
- with st.expander("📋 Cleaning Summary", expanded=False):
195
- st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
 
197
  st.subheader("💬 Ask AI About Your Data")
198
  user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
199
  if st.button("Analyze with AI", use_container_width=True) and user_query:
 
111
  df.drop_duplicates(inplace=True)
112
  return df
113
 
114
+ def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
115
+ """Return cleaned dataset and a message if cleaning failed."""
116
+ max_allowed_rows = 2000
117
+ if len(df) > max_allowed_rows:
118
+ return df, f"⚠️ Dataset too large for AI cleaning (>{max_allowed_rows} rows). Using original dataset."
119
+
120
  csv_text = df.to_csv(index=False)
121
  prompt = f"""
122
  You are a professional data cleaning assistant.
 
132
  """
133
  try:
134
  cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
135
+ cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
 
 
 
 
 
136
  cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
137
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
138
+ return cleaned_df, ""
139
  except Exception as e:
140
+ return df, f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
 
141
 
142
  # ======================================================
143
  # 🧩 DATA ANALYSIS
 
185
  df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
186
 
187
  with st.spinner("🧼 AI Cleaning your dataset..."):
188
+ cleaned_df, cleaning_msg = ai_clean_dataset(df)
189
+
190
+ if cleaning_msg:
191
+ st.warning(cleaning_msg)
192
+ st.info("💡 Note: For AI cleaning to work best, datasets should ideally be under 2000 rows.")
193
 
194
+ st.subheader("✅ Dataset Preview")
195
  st.dataframe(cleaned_df.head(), use_container_width=True)
196
 
197
+ # ================== Quick Visualizations ==================
198
+ with st.expander("📈 Quick Visualizations", expanded=True):
199
+ numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
200
+ categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
201
+
202
+ viz_type = st.selectbox(
203
+ "Visualization Type",
204
+ ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
205
+ )
206
+
207
+ if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
208
+ x = st.selectbox("X-axis", numeric_cols)
209
+ y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
210
+ color = st.selectbox("Color", ["None"] + categorical_cols)
211
+ fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
212
+ st.plotly_chart(fig, use_container_width=True)
213
+
214
+ elif viz_type == "Histogram" and numeric_cols:
215
+ col = st.selectbox("Column", numeric_cols)
216
+ fig = px.histogram(cleaned_df, x=col, nbins=30)
217
+ st.plotly_chart(fig, use_container_width=True)
218
+
219
+ elif viz_type == "Box Plot" and numeric_cols:
220
+ col = st.selectbox("Column", numeric_cols)
221
+ fig = px.box(cleaned_df, y=col)
222
+ st.plotly_chart(fig, use_container_width=True)
223
+
224
+ elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
225
+ corr = cleaned_df[numeric_cols].corr()
226
+ fig = ff.create_annotated_heatmap(
227
+ z=corr.values,
228
+ x=list(corr.columns),
229
+ y=list(corr.index),
230
+ annotation_text=corr.round(2).values,
231
+ showscale=True
232
+ )
233
+ st.plotly_chart(fig, use_container_width=True)
234
+
235
+ elif viz_type == "Categorical Count" and categorical_cols:
236
+ cat = st.selectbox("Category", categorical_cols)
237
+ fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
238
+ st.plotly_chart(fig, use_container_width=True)
239
+ else:
240
+ st.warning("⚠️ Not enough columns for this visualization type.")
241
 
242
+ # ================== AI Analysis ==================
243
  st.subheader("💬 Ask AI About Your Data")
244
  user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
245
  if st.button("Analyze with AI", use_container_width=True) and user_query: