Starberry15 committed on
Commit
34f26fc
·
verified ·
1 Parent(s): 6ccbf34

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +32 -25
src/streamlit_app.py CHANGED
@@ -112,16 +112,9 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
112
  return df
113
 
114
  def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
115
- """
116
- Attempts AI cleaning. Returns:
117
- - DataFrame (cleaned or original)
118
- - Message explaining status or fallback reason
119
- """
120
- # Skip cleaning if dataset too large
121
  if len(df) > 50:
122
- msg = "⚠️ AI cleaning skipped: dataset has more than 50 rows. Using original dataset for analysis."
123
- return df, msg
124
-
125
  csv_text = df.to_csv(index=False)
126
  prompt = f"""
127
  You are a professional data cleaning assistant.
@@ -140,22 +133,40 @@ Dataset:
140
  cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
141
  cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
142
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
143
- return cleaned_df, ""
144
  except Exception as e:
145
- msg = f"⚠️ AI cleaning failed: {e}. Using original dataset for analysis."
146
- return df, msg
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  # ======================================================
149
  # 🧩 DATA ANALYSIS
150
  # ======================================================
151
  def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
152
- csv_text = df.to_csv(index=False)
153
  prompt = f"""
154
  You are a professional data analyst.
155
  Analyze the dataset '{dataset_name}' and answer the user's question.
156
 
157
- --- FULL DATA ---
158
- {csv_text}
159
 
160
  --- USER QUESTION ---
161
  {user_query}
@@ -180,7 +191,7 @@ Respond with:
180
  else:
181
  return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
182
  except Exception as e:
183
- return f"⚠️ Analysis failed: {e}"
184
 
185
  # ======================================================
186
  # 🚀 MAIN APP LOGIC
@@ -191,25 +202,21 @@ if uploaded:
191
  df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
192
 
193
  with st.spinner("🧼 AI Cleaning your dataset..."):
194
- cleaned_df, cleaning_msg = ai_clean_dataset(df)
195
 
196
- # Show warning if cleaning skipped or failed
197
- if cleaning_msg:
198
- st.warning(cleaning_msg)
199
 
200
- st.subheader("βœ… Dataset Preview")
201
  st.dataframe(cleaned_df.head(), use_container_width=True)
202
 
203
- with st.expander("πŸ“‹ Dataset Summary", expanded=False):
204
- st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
205
-
206
  st.subheader("πŸ’¬ Ask AI About Your Data")
207
  user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
208
  if st.button("Analyze with AI", use_container_width=True) and user_query:
209
  with st.spinner("πŸ€– Interpreting data..."):
 
210
  result = query_analysis_model(cleaned_df, user_query, uploaded.name)
211
  st.markdown("### πŸ’‘ Insights")
212
  st.markdown(result)
213
-
214
  else:
215
  st.info("πŸ“₯ Upload a dataset to begin smart analysis.")
 
112
  return df
113
 
114
  def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
115
+ """Returns cleaned df and a status message"""
 
 
 
 
 
116
  if len(df) > 50:
117
+ return df, "AI cleaning skipped: dataset has more than 50 rows."
 
 
118
  csv_text = df.to_csv(index=False)
119
  prompt = f"""
120
  You are a professional data cleaning assistant.
 
133
  cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
134
  cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
135
  cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
136
+ return cleaned_df, "AI cleaning completed successfully."
137
  except Exception as e:
138
+ return df, f"AI cleaning failed: {str(e)}"
139
+
140
+ # ======================================================
141
+ # 🧩 DATA SUMMARY FOR TOKEN-EFFICIENT ANALYSIS
142
+ # ======================================================
143
+ def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
144
+ summary = [f"Rows: {len(df)}, Columns: {len(df.columns)}"]
145
+ for col in df.columns:
146
+ non_null = int(df[col].notnull().sum())
147
+ if pd.api.types.is_numeric_dtype(df[col]):
148
+ desc = df[col].describe().to_dict()
149
+ summary.append(f"- {col}: mean={desc.get('mean', np.nan):.2f}, median={df[col].median():.2f}, non_null={non_null}")
150
+ else:
151
+ top = df[col].value_counts().head(3).to_dict()
152
+ summary.append(f"- {col}: top_values={top}, non_null={non_null}")
153
+ # Include a small sample
154
+ sample = df.head(sample_rows).to_csv(index=False)
155
+ summary.append("--- Sample Data ---")
156
+ summary.append(sample)
157
+ return "\n".join(summary)
158
 
159
  # ======================================================
160
  # 🧩 DATA ANALYSIS
161
  # ======================================================
162
  def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
163
+ prompt_summary = summarize_for_analysis(df)
164
  prompt = f"""
165
  You are a professional data analyst.
166
  Analyze the dataset '{dataset_name}' and answer the user's question.
167
 
168
+ --- DATA SUMMARY ---
169
+ {prompt_summary}
170
 
171
  --- USER QUESTION ---
172
  {user_query}
 
191
  else:
192
  return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
193
  except Exception as e:
194
+ return f"⚠️ Analysis failed: {str(e)}"
195
 
196
  # ======================================================
197
  # 🚀 MAIN APP LOGIC
 
202
  df = pd.read_csv(uploaded) if uploaded.name.endswith(".csv") else pd.read_excel(uploaded)
203
 
204
  with st.spinner("🧼 AI Cleaning your dataset..."):
205
+ cleaned_df, cleaning_status = ai_clean_dataset(df)
206
 
207
+ st.subheader("βœ… Data Cleaning Status")
208
+ st.info(cleaning_status)
 
209
 
210
+ st.subheader("πŸ“Š Dataset Preview")
211
  st.dataframe(cleaned_df.head(), use_container_width=True)
212
 
 
 
 
213
  st.subheader("πŸ’¬ Ask AI About Your Data")
214
  user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
215
  if st.button("Analyze with AI", use_container_width=True) and user_query:
216
  with st.spinner("πŸ€– Interpreting data..."):
217
+ # Analyst can work with original or cleaned dataset
218
  result = query_analysis_model(cleaned_df, user_query, uploaded.name)
219
  st.markdown("### πŸ’‘ Insights")
220
  st.markdown(result)
 
221
  else:
222
  st.info("πŸ“₯ Upload a dataset to begin smart analysis.")