Rustamshry committed on
Commit c519da6 · verified · 1 parent: 12882fb

Update app.py

Files changed (1):
  1. app.py +495 -0
app.py CHANGED
@@ -0,0 +1,495 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import io
+ from PIL import Image
+ import gradio as gr
+ from smolagents import tool, CodeAgent, InferenceClientModel
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.linear_model import LogisticRegression, LinearRegression
+ import joblib
+ import tempfile
+ import os
+
+
+ # 🔑 Set your HF API key
+ def set_hf_token(token):
+     os.environ["HF_TOKEN"] = token.strip()
+     return "✅ Token saved successfully! You can now upload your CSV file."
+
+ # ————————————————————————————————
+ # 🔍 Heuristic Target Column Detection
+ # ————————————————————————————————
+
+ def detect_target_column(df: pd.DataFrame) -> str:
+     """
+     Heuristically detect the most likely target column based on naming, cardinality, and type.
+     """
+     if df.empty or len(df.columns) < 2:
+         return None
+
+     scores = {}
+
+     for col in df.columns:
+         score = 0.0
+         name_lower = col.lower()
+
+         # Rule 1: Name matches common target keywords
+         keywords = ["target", "label", "class", "outcome", "result", "y", "output", "flag", "status", "churn", "survived", "price", "sale"]
+         if any(kw in name_lower for kw in keywords):
+             score += 3.0
+         if name_lower in ["target", "label", "class", "y"]:
+             score += 2.0
+
+         # Rule 2: Binary or low-cardinality categorical → likely classification
+         nunique = df[col].nunique()
+         total = len(df)
+         unique_ratio = nunique / total
+
+         if nunique == 2 and df[col].dtype in ["int64", "object", "category"]:
+             score += 4.0  # Strong signal
+         elif nunique <= 20 and df[col].dtype in ["int64", "object", "category"]:
+             score += 3.0
+
+         # Rule 3: High unique ratio + numeric → likely regression target
+         if unique_ratio > 0.8 and df[col].dtype in ["int64", "float64"]:
+             score += 2.5
+
+         # Rule 4: Avoid ID-like or high-cardinality text
+         id_keywords = ["id", "name", "email", "phone", "address", "username", "url", "link"]
+         if any(kw in name_lower for kw in id_keywords):
+             score -= 10.0
+         if nunique == total and df[col].dtype == "object":
+             score -= 10.0  # Likely a unique identifier
+
+         scores[col] = score
+
+     # Return the best candidate only if its score is positive
+     best_col = max(scores, key=scores.get)
+     return best_col if scores[best_col] > 0 else None
+
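+ # A worked example of the scoring above (hypothetical data): for a frame with
+ # columns ["customer_id", "age", "churn"] where "churn" holds 0/1 values,
+ # "churn" scores 3.0 (keyword match) + 4.0 (binary) = 7.0, "customer_id" is
+ # pushed below zero by the -10.0 ID-keyword penalty, so "churn" is returned.
+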
+ # ————————————————————————————————
+ # 🛠️ Tool 1: LoadData
+ # ————————————————————————————————
+
+ @tool
+ def LoadData(filepath: str) -> dict:
+     """
+     Loads data from a CSV file and returns it as a dictionary.
+
+     Args:
+         filepath (str): Path to the CSV file.
+
+     Returns:
+         dict: Data as a dictionary (from DataFrame.to_dict()).
+     """
+     df = pd.read_csv(filepath)
+     return df.to_dict()
+
+ # ————————————————————————————————
+ # 🛠️ Tool 2: CleanData (Enhanced)
+ # ————————————————————————————————
+
+ @tool
+ def CleanData(data: dict, handle_outliers: bool = True, impute_strategy: str = "median_mode") -> pd.DataFrame:
+     """
+     Cleans a dataset with smart imputation, encoding, and optional outlier removal.
+
+     Args:
+         data (dict): Dataset in dictionary format.
+         handle_outliers (bool): Whether to remove outliers using the IQR rule.
+         impute_strategy (str): "median_mode" or "mean_mode".
+
+     Returns:
+         pd.DataFrame: Cleaned dataset.
+     """
+     df = pd.DataFrame.from_dict(data)
+
+     # Drop duplicates
+     df = df.drop_duplicates().reset_index(drop=True)
+
+     # Handle missing values: median for skewed numeric columns, mean only when
+     # explicitly requested and skew is mild, mode for everything else
+     for col in df.columns:
+         if df[col].dtype in ["int64", "float64"]:
+             if impute_strategy == "median_mode" or df[col].skew() > 1:
+                 fill_val = df[col].median()
+             else:
+                 fill_val = df[col].mean()
+             df[col] = df[col].fillna(fill_val)
+         else:
+             mode = df[col].mode()
+             fill_val = mode[0] if len(mode) > 0 else "Unknown"
+             df[col] = df[col].fillna(fill_val)
+
+     # Parse datetime columns (infer_datetime_format is deprecated in pandas 2.x,
+     # so rely on the default parser)
+     for col in df.columns:
+         if "date" in col.lower() or "time" in col.lower():
+             try:
+                 df[col] = pd.to_datetime(df[col], errors="coerce")
+             except Exception:
+                 pass
+
+     # Encode categorical variables (only if not too many unique values)
+     for col in df.select_dtypes(include="object").columns:
+         if df[col].nunique() / len(df) < 0.5:
+             df[col] = df[col].astype("category").cat.codes
+         # else: leave as object (e.g., free text)
+
+     # Outlier removal (optional): keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
+     # per numeric column
+     if handle_outliers:
+         for col in df.select_dtypes(include=["float64", "int64"]).columns:
+             Q1 = df[col].quantile(0.25)
+             Q3 = df[col].quantile(0.75)
+             IQR = Q3 - Q1
+             lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
+             df = df[(df[col] >= lower) & (df[col] <= upper)]
+             if len(df) == 0:
+                 # Avoid an empty frame: revert to the original data
+                 df = pd.DataFrame.from_dict(data)
+                 break
+
+     return df.reset_index(drop=True)
+
+ # ————————————————————————————————
+ # 📊 Tool 3: EDA (Enhanced)
+ # ————————————————————————————————
+
+ @tool
+ def EDA(data: dict, max_cat_plots: int = 3, max_num_plots: int = 3) -> dict:
+     """
+     Performs advanced EDA with smart visualizations and insights.
+
+     Args:
+         data (dict): Dataset in dictionary format.
+         max_cat_plots (int): Max number of categorical distribution plots.
+         max_num_plots (int): Max number of numeric-vs-target plots.
+
+     Returns:
+         dict: EDA results including text, plots, and recommendations.
+     """
+     df = pd.DataFrame.from_dict(data)
+     results = {}
+
+     # 1. Summary stats
+     results["summary"] = df.describe(include="all").to_string()
+
+     # 2. Missing values
+     missing = df.isnull().sum()
+     results["missing_values"] = missing[missing > 0].to_dict()
+
+     # Missingness heatmap
+     if missing.sum() > 0:
+         plt.figure(figsize=(8, 4))
+         sns.heatmap(df.isnull(), cbar=True, cmap="viridis", yticklabels=False)
+         buf = io.BytesIO()
+         plt.savefig(buf, format="png", bbox_inches="tight")
+         plt.close()
+         buf.seek(0)
+         results["missingness_plot"] = Image.open(buf)
+
+     # 3. Correlation heatmap
+     corr = df.corr(numeric_only=True)
+     if not corr.empty and len(corr.columns) > 1:
+         plt.figure(figsize=(8, 6))
+         sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", square=True)
+         buf = io.BytesIO()
+         plt.savefig(buf, format="png", bbox_inches="tight")
+         plt.close()
+         buf.seek(0)
+         results["correlation_plot"] = Image.open(buf)
+
+         # Top 5 absolute correlations (excluding self-correlations)
+         unstacked = corr.abs().unstack()
+         unstacked = unstacked[unstacked < 1.0]
+         results["top_correlations"] = unstacked.sort_values(ascending=False).head(5).to_dict()
+
+     # 4. Skewness & kurtosis
+     numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
+     skew_kurt = {}
+     for col in numeric_cols:
+         skew_kurt[col] = {"skew": df[col].skew(), "kurtosis": df[col].kurtosis()}
+     results["skew_kurtosis"] = skew_kurt
+
+     # 5. Numeric distributions
+     if len(numeric_cols) > 0:
+         df[numeric_cols].hist(bins=20, figsize=(12, 8), layout=(2, -1))
+         buf = io.BytesIO()
+         plt.savefig(buf, format="png", bbox_inches="tight")
+         plt.close()
+         buf.seek(0)
+         results["numeric_distributions"] = Image.open(buf)
+
+     # 6. Categorical distributions
+     cat_cols = df.select_dtypes(include=["object", "category"]).columns
+     for col in cat_cols[:max_cat_plots]:
+         plt.figure(figsize=(6, 4))
+         top_vals = df[col].value_counts().head(10)
+         sns.barplot(x=top_vals.index, y=top_vals.values)
+         plt.xticks(rotation=45)
+         buf = io.BytesIO()
+         plt.savefig(buf, format="png", bbox_inches="tight")
+         plt.close()
+         buf.seek(0)
+         results[f"dist_{col}"] = Image.open(buf)
+
+     # 7. Target relationships
+     target_col = detect_target_column(df)
+     if target_col:
+         results["detected_target"] = target_col
+         for col in numeric_cols[:max_num_plots]:
+             plt.figure(figsize=(6, 4))
+             if df[target_col].nunique() <= 20:
+                 sns.boxplot(data=df, x=target_col, y=col)
+             else:
+                 sns.scatterplot(data=df, x=col, y=target_col)
+             buf = io.BytesIO()
+             plt.savefig(buf, format="png", bbox_inches="tight")
+             plt.close()
+             buf.seek(0)
+             results[f"{col}_vs_{target_col}"] = Image.open(buf)
+
+     # 8. Recommendations
+     recs = []
+     for col, sk in skew_kurt.items():
+         if abs(sk["skew"]) > 1:
+             recs.append(f"Feature '{col}' is skewed ({sk['skew']:.2f}) → consider a log transform.")
+     if results["missing_values"]:
+         recs.append("Missing data detected → consider KNN or iterative imputation.")
+     if results.get("top_correlations"):
+         recs.append("High correlations found → consider PCA or feature selection.")
+     if target_col:
+         recs.append(f"Target variable '{target_col}' detected automatically.")
+     results["recommendations"] = recs
+
+     return results
+
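+ # The returned dict mixes text entries ("summary", "missing_values",
+ # "skew_kurtosis", "top_correlations", "recommendations", ...) with PIL images
+ # ("correlation_plot", "numeric_distributions", "dist_<col>", ...);
+ # analyze_data below splits them apart by type.
+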
+ # ————————————————————————————————
+ # 🤖 Tool 4: AutoML (Enhanced)
+ # ————————————————————————————————
+
+ @tool
+ def AutoML(data: dict, task_hint: str = None) -> dict:
+     """
+     Enhanced AutoML with multiple models and robust evaluation.
+
+     Args:
+         data (dict): Cleaned dataset.
+         task_hint (str): "classification", "regression", or None.
+
+     Returns:
+         dict: Model results and metrics.
+     """
+     df = pd.DataFrame.from_dict(data)
+     results = {}
+
+     target_col = detect_target_column(df)
+     if not target_col:
+         results["note"] = "No target column detected. Check column names and data."
+         return results
+
+     X = df.drop(columns=[target_col])
+     y = df[target_col]
+
+     # One-hot encode X
+     X = pd.get_dummies(X, drop_first=True)
+
+     if X.shape[1] == 0:
+         results["error"] = "No valid features after encoding."
+         return results
+
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+     # Detect task
+     if task_hint:
+         task = task_hint
+     elif y.dtype in ["object", "category"] or y.nunique() <= 20:
+         task = "classification"
+     else:
+         task = "regression"
+
+     best_model = None  # Track the best estimator so it can be saved below
+     try:
+         if task == "classification":
+             models = {
+                 "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
+                 "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42)
+             }
+             results["task"] = "classification"
+             best_acc = -1.0
+             for name, model in models.items():
+                 model.fit(X_train, y_train)
+                 preds = model.predict(X_test)
+                 acc = accuracy_score(y_test, preds)
+                 if acc > best_acc:
+                     best_acc = acc
+                     best_model = model  # Was missing here, causing a NameError when saving classification models
+                     results["accuracy"] = acc
+                     results["best_model"] = name
+                     results["report"] = classification_report(y_test, preds, zero_division=0)
+                     if hasattr(model, "feature_importances_"):
+                         results["feature_importance"] = dict(zip(X.columns, model.feature_importances_))
+
+         else:
+             models = {
+                 "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
+                 "LinearRegression": LinearRegression()
+             }
+             results["task"] = "regression"
+             best_r2 = -float("inf")
+             for name, model in models.items():
+                 model.fit(X_train, y_train)
+                 preds = model.predict(X_test)
+                 r2 = r2_score(y_test, preds)
+                 if r2 > best_r2:
+                     best_r2 = r2
+                     best_model = model  # Keep the best model
+                     results["r2_score"] = r2
+                     results["mse"] = mean_squared_error(y_test, preds)
+                     results["best_model"] = name
+                     if hasattr(model, "feature_importances_"):
+                         results["feature_importance"] = dict(zip(X.columns, model.feature_importances_))
+
+         # ✅ Save the best model to a temporary file
+         if best_model is not None:
+             model_dir = tempfile.mkdtemp()
+             model_path = os.path.join(model_dir, f"trained_model_{task}.pkl")
+             joblib.dump({
+                 "model": best_model,
+                 "task": task,
+                 "target_column": target_col,
+                 "features": X.columns.tolist()
+             }, model_path)
+
+             results["model_download_path"] = model_path
+             results["model_info"] = f"Best model: {results['best_model']} | Task: {task} | Target: {target_col}"
+
+     except Exception as e:
+         results["error"] = f"Model training failed: {str(e)}"
+
+     return results
+
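+ # Reusing the downloaded artifact later (a sketch; "new_data.csv" is a
+ # hypothetical file that must supply the same feature columns):
+ #   bundle = joblib.load("trained_model_classification.pkl")
+ #   X_new = pd.get_dummies(pd.read_csv("new_data.csv"), drop_first=True)
+ #   preds = bundle["model"].predict(X_new.reindex(columns=bundle["features"], fill_value=0))
+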
+ # ————————————————————————————————
+ # 🧠 Initialize the AI Agent
+ # ————————————————————————————————
+
+ agent = CodeAgent(
+     tools=[LoadData, CleanData, EDA, AutoML],
+     model=InferenceClientModel(
+         model_id="Qwen/Qwen2.5-Coder-1.5B-Instruct",
+         # .get() avoids a KeyError at import time; the token is read once here,
+         # so set HF_TOKEN in the environment before the app starts
+         token=os.environ.get("HF_TOKEN"),
+         provider="featherless-ai",
+         max_tokens=4048
+     ),
+     additional_authorized_imports=[
+         "pandas", "matplotlib.pyplot", "seaborn", "PIL", "sklearn", "io", "os", "joblib", "tempfile"
+     ],
+     max_steps=10,
+ )
+
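+ # The CodeAgent answers a prompt by writing and executing Python that calls
+ # the @tool functions above, roughly (illustrative only, not a real trace):
+ #   data = LoadData(filepath="train.csv")
+ #   cleaned = CleanData(data=data, handle_outliers=True)
+ #   report = EDA(data=cleaned.to_dict())
+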
+ # ————————————————————————————————
+ # 🖼️ Gradio Interface
+ # ————————————————————————————————
+
+ def analyze_data(file):
+     if "HF_TOKEN" not in os.environ or not os.environ["HF_TOKEN"]:
+         return "❌ Please enter your HF token first!", [], None
+
+     filepath = file.name
+     prompt = f"""
+     Load the data from '{filepath}', then clean it using CleanData with outlier handling.
+     Run EDA to analyze data quality, distributions, and detect the target variable.
+     If a target is found, run AutoML to train the best model.
+     Return all insights, metrics, and visualizations.
+     """
+     try:
+         results = agent.run(prompt)
+     except Exception as e:
+         results = {"error": f"Agent failed: {str(e)}"}
+
+     # The agent may return a plain string instead of a dict; normalize it
+     if not isinstance(results, dict):
+         results = {"summary": str(results)}
+
+     # === Text Report ===
+     text_output = ""
+
+     if "error" in results:
+         text_output = f"❌ Error: {results['error']}"
+     else:
+         summary = results.get("summary", "No summary.")
+         missing_vals = results.get("missing_values", {})
+         top_corr = results.get("top_correlations", {})
+         outliers = results.get("outliers", {})
+         recs = results.get("recommendations", [])
+         detected_target = results.get("detected_target", "Unknown")
+
+         text_output += f"### 📊 Dataset Overview\n"
+         text_output += f"**Detected Target:** `{detected_target}`\n\n"
+         text_output += f"### Summary Stats\n{summary}\n\n"
+         text_output += f"### Missing Values\n{missing_vals}\n\n"
+         text_output += f"### Top Correlations\n{top_corr}\n\n"
+         text_output += f"### Outliers\n{outliers}\n\n"
+         text_output += f"### Recommendations\n" + "\n".join([f"- {r}" for r in recs]) + "\n\n"
+
+         if "task" in results:
+             task = results["task"]
+             text_output += f"### 🤖 AutoML Results ({task.title()})\n"
+             text_output += f"**Best Model:** {results.get('best_model', 'Unknown')}\n"
+             if task == "classification":
+                 text_output += f"**Accuracy:** {results['accuracy']:.3f}\n\n"
+                 text_output += f"```\n{results['report']}\n```\n"
+             else:
+                 text_output += f"**R²:** {results['r2_score']:.3f}, **MSE:** {results['mse']:.3f}\n"
+
+             feat_imp = sorted(results.get("feature_importance", {}).items(), key=lambda x: x[1], reverse=True)[:5]
+             text_output += f"### Top Features\n" + "\n".join([f"- `{f}`: {imp:.3f}" for f, imp in feat_imp])
+
+     # === Collect Plots ===
+     plots = [value for value in results.values() if isinstance(value, Image.Image)]
+
+     model_file = results.get("model_download_path", None)
+     if model_file and os.path.exists(model_file):
+         model_download_output = model_file
+     else:
+         model_download_output = None  # No file to download
+
+     return text_output, plots, model_download_output
+
+
+ # ————————————————————————————————
+ # 🚀 Launch Gradio App
+ # ————————————————————————————————
+
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 🧠 AI Data Analyst Agent with AutoML & Smart Target Detection")
+     gr.Markdown("Enter your Hugging Face token, then upload a CSV file.")
+
+     token_box = gr.Textbox(label="🔑 Hugging Face Token", placeholder="Enter your HF token here...", type="password")
+     token_status = gr.Markdown()
+     token_box.submit(set_hf_token, inputs=token_box, outputs=token_status)
+
+     with gr.Row():
+         file_input = gr.File(label="📁 Upload CSV")
+     with gr.Row():
+         text_output = gr.Textbox(label="📝 Analysis Report", lines=24)
+     with gr.Row():
+         plots_output = gr.Gallery(label="📊 EDA & Model Plots", scale=2)
+     with gr.Row():
+         model_download = gr.File(label="💾 Download Trained Model (.pkl)")
+
+     file_input.upload(analyze_data, inputs=file_input, outputs=[text_output, plots_output, model_download])
+
+ # Launch
+ if __name__ == "__main__":
+     demo.launch(share=True)  # Use share=True for a public link
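+ # To try this locally (assumed dependencies; the commit pins no requirements):
+ #   pip install gradio smolagents pandas scikit-learn seaborn pillow joblib
+ #   python app.py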