Update src/streamlit_app.py
src/streamlit_app.py  (+7 -7)  CHANGED
@@ -114,7 +114,7 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
 def ai_clean_dataset(df: pd.DataFrame) -> (pd.DataFrame, str):
     """Returns cleaned df and a status message"""
     if len(df) > 50:
-        return df, "AI cleaning skipped: dataset has more than 50 rows."
+        return df, "⚠️ AI cleaning skipped: dataset has more than 50 rows."
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
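The guard at line 117 means callers always get back a (DataFrame, status message) pair, whether or not the AI pass actually ran. A minimal usage sketch, assuming the surrounding Streamlit app feeds an uploaded CSV straight into ai_clean_dataset; the uploader widget below is illustrative and not part of this commit:

import pandas as pd
import streamlit as st

# Hypothetical caller for ai_clean_dataset from the hunk above (not in this commit).
uploaded = st.file_uploader("Upload a CSV", type="csv")
if uploaded is not None:
    df = pd.read_csv(uploaded)
    cleaned_df, status = ai_clean_dataset(df)  # original df comes back when len(df) > 50
    st.info(status)          # e.g. "⚠️ AI cleaning skipped: dataset has more than 50 rows."
    st.dataframe(cleaned_df)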
@@ -133,9 +133,9 @@ Dataset:
         cleaned_str = cleaned_str.replace("```csv", "").replace("```", "").replace("###", "").strip()
         cleaned_df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
         cleaned_df.columns = [c.strip().replace(" ", "_").lower() for c in cleaned_df.columns]
-        return cleaned_df, "AI cleaning completed successfully."
+        return cleaned_df, "✅ AI cleaning completed successfully."
     except Exception as e:
-        return df, f"AI cleaning failed: {str(e)}"
+        return df, f"⚠️ AI cleaning failed: {str(e)}"
 
 # ======================================================
 # 🧩 DATA SUMMARY FOR TOKEN-EFFICIENT ANALYSIS
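The status strings above pair with the fence-stripping just before them: the model usually wraps its reply in a Markdown fence, which has to be removed before pandas can parse it. A self-contained round trip with made-up data, showing what the stripping and the lenient on_bad_lines="skip" parse (pandas 1.3+) are expected to do:

from io import StringIO
import pandas as pd

# Simulated model reply: CSV wrapped in a Markdown fence, as handled in the hunk above.
raw = "```csv\nName, Age\nAlice, 30\nBob, 41\n```"
cleaned_str = raw.replace("```csv", "").replace("```", "").replace("###", "").strip()

# on_bad_lines="skip" drops malformed rows instead of raising a ParserError.
df = pd.read_csv(StringIO(cleaned_str), on_bad_lines="skip")
df.columns = [c.strip().replace(" ", "_").lower() for c in df.columns]
print(df.columns.tolist())  # ['name', 'age']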
@@ -150,7 +150,7 @@ def summarize_for_analysis(df: pd.DataFrame, sample_rows=10) -> str:
         else:
             top = df[col].value_counts().head(3).to_dict()
             summary.append(f"- {col}: top_values={top}, non_null={non_null}")
-    # Include a small sample
+    # Include a small sample for context
     sample = df.head(sample_rows).to_csv(index=False)
     summary.append("--- Sample Data ---")
     summary.append(sample)
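Only the non-numeric branch of the column loop and the trailing sample fall inside this hunk; the numeric branch and the return sit outside the diff context. A rough self-contained sketch of the same summarization pattern, with the numeric branch filled in as an assumption rather than copied from the file:

import pandas as pd

def summarize_for_analysis(df: pd.DataFrame, sample_rows: int = 10) -> str:
    """Compact, token-friendly description of a DataFrame (illustrative sketch)."""
    summary = []
    for col in df.columns:
        non_null = int(df[col].notna().sum())
        if pd.api.types.is_numeric_dtype(df[col]):
            # Assumed numeric branch; the commit only shows the categorical one.
            summary.append(
                f"- {col}: min={df[col].min()}, max={df[col].max()}, "
                f"mean={df[col].mean():.2f}, non_null={non_null}"
            )
        else:
            top = df[col].value_counts().head(3).to_dict()
            summary.append(f"- {col}: top_values={top}, non_null={non_null}")
    # Include a small sample for context
    summary.append("--- Sample Data ---")
    summary.append(df.head(sample_rows).to_csv(index=False))
    return "\n".join(summary)

print(summarize_for_analysis(pd.DataFrame({"city": ["Oslo", "Oslo", "Lima"], "temp": [3, 5, 21]})))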
@@ -181,13 +181,13 @@ Respond with:
         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
             if GEMINI_API_KEY is None:
                 return "⚠️ Gemini API key missing."
-            response = genai.
+            response = genai.models.generate(
                 model="gemini-2.5-flash",
-
+                messages=[{"author": "user", "content": prompt}],
                 temperature=temperature,
                 max_output_tokens=max_tokens
             )
-            return
+            return response.candidates[0].content if response.candidates else "No response from Gemini."
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
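The rewritten Gemini branch assumes a genai.models.generate(...) entry point that takes a messages list and returns candidates. If genai here is the current google-genai client library, the equivalent request would normally go through Client.models.generate_content instead; the sketch below shows that variant for comparison only, reusing GEMINI_API_KEY, prompt, temperature and max_tokens from the hunk, and is an assumption about the SDK rather than this commit's code:

from google import genai
from google.genai import types

def gemini_generate(prompt: str, api_key: str, temperature: float, max_tokens: int) -> str:
    """Gemini call via the google-genai SDK (illustrative assumption, not this commit's code)."""
    client = genai.Client(api_key=api_key)
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=types.GenerateContentConfig(
            temperature=temperature,
            max_output_tokens=max_tokens,
        ),
    )
    # response.text joins the text parts of the first candidate; fall back if nothing came back.
    return response.text or "No response from Gemini."

Keeping a fallback string either way lets the caller treat the Gemini and Hugging Face branches uniformly.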