Starberry15 committed on
Commit
1eb934c
·
verified ·
1 Parent(s): 8558e29

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +62 -158
src/streamlit_app.py CHANGED
@@ -1,150 +1,55 @@
1
  # streamlit_data_analysis_app.py
2
- # Streamlit Data Analysis App using Gemini 2.0 Flash (Free-tier)
3
  # Features:
4
  # - Upload CSV / Excel
5
- # - Automatic cleaning & standardization
6
- # - Preprocessing (imputation, encoding, scaling)
7
- # - Quick visualizations
8
- # - Dataset summary + preview
9
- # - Insights powered by Gemini 2.0 Flash (Google AI)
10
 
11
  import os
12
  import streamlit as st
13
  import pandas as pd
14
- import numpy as np
15
  import matplotlib.pyplot as plt
16
  import seaborn as sns
17
- from sklearn.impute import SimpleImputer
18
- from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
19
- from sklearn.compose import ColumnTransformer
20
- from sklearn.pipeline import Pipeline
21
  import google.generativeai as genai
22
 
23
- # ---------- CONFIGURATION ----------
24
- st.set_page_config(page_title="Data Analysis App", layout="wide")
25
 
26
- # Load Gemini API key safely
27
  try:
28
- GEMINI_API_KEY = st.secrets["GEMINI_API_KEY"]
29
- except Exception:
30
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
31
 
 
 
32
  if GEMINI_API_KEY:
33
  genai.configure(api_key=GEMINI_API_KEY)
34
- st.success("✅ Gemini API key loaded successfully.")
35
- else:
36
- st.warning("⚠️ No Gemini API key found. Please add GEMINI_API_KEY to .env or Streamlit secrets.")
37
 
38
  # ---------- UTILITIES ----------
39
  def read_file(uploaded_file):
40
- """Read uploaded file and return DataFrame"""
41
  name = uploaded_file.name.lower()
 
 
 
 
 
 
 
 
 
 
 
42
  try:
43
- if name.endswith(('.csv', '.txt')):
44
- # ✅ FIX: Remove 'errors' argument
45
- return pd.read_csv(uploaded_file, encoding="utf-8")
46
- elif name.endswith(('.xls', '.xlsx')):
47
- return pd.read_excel(uploaded_file)
48
- else:
49
- raise ValueError("Unsupported file type. Please upload CSV or Excel.")
50
- except UnicodeDecodeError:
51
- # fallback encoding if utf-8 fails
52
- return pd.read_csv(uploaded_file, encoding="latin1")
53
  except Exception as e:
54
- st.error(f"❌ File reading failed: {e}")
55
- raise
56
-
57
-
58
- def clean_column_name(col: str) -> str:
59
- col = str(col).strip().lower().replace("\n", " ").replace("\t", " ")
60
- col = "_".join(col.split())
61
- col = ''.join(c for c in col if (c.isalnum() or c == '_'))
62
- while '__' in col:
63
- col = col.replace('__', '_')
64
- return col
65
-
66
- def standardize_dataframe(df: pd.DataFrame, drop_all_nan_cols: bool = True) -> pd.DataFrame:
67
- df = df.copy()
68
- for c in df.select_dtypes(include=['object']).columns:
69
- df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
70
- df.columns = [clean_column_name(c) for c in df.columns]
71
- if drop_all_nan_cols:
72
- df.dropna(axis=1, how='all', inplace=True)
73
- for c in df.columns:
74
- if df[c].dtype == object:
75
- sample = df[c].dropna().astype(str).head(20)
76
- if not sample.empty:
77
- parsed = pd.to_datetime(sample, errors='coerce')
78
- if parsed.notna().sum() / len(sample) > 0.6:
79
- df[c] = pd.to_datetime(df[c], errors='coerce')
80
- return df
81
-
82
- def summarize_dataframe(df: pd.DataFrame, max_rows: int = 5):
83
- summary = {'shape': df.shape, 'columns': [], 'preview': df.head(max_rows).to_dict(orient='records')}
84
- for c in df.columns:
85
- info = {'name': c, 'dtype': str(df[c].dtype), 'n_missing': int(df[c].isna().sum()), 'n_unique': int(df[c].nunique(dropna=True))}
86
- if pd.api.types.is_numeric_dtype(df[c]):
87
- info['summary'] = df[c].describe().to_dict()
88
- elif pd.api.types.is_datetime64_any_dtype(df[c]):
89
- info['summary'] = {'min': str(df[c].min()), 'max': str(df[c].max())}
90
- else:
91
- info['top_values'] = df[c].astype(str).value_counts().head(5).to_dict()
92
- summary['columns'].append(info)
93
- return summary
94
-
95
- def prepare_preprocessing_pipeline(df: pd.DataFrame, impute_strategy_num='median', scale_numeric=True, encode_categorical='onehot'):
96
- numeric_cols = list(df.select_dtypes(include=[np.number]).columns)
97
- cat_cols = list(df.select_dtypes(include=['object', 'category', 'bool']).columns)
98
- transformers = []
99
- if numeric_cols:
100
- num_pipe = [('imputer', SimpleImputer(strategy=impute_strategy_num))]
101
- if scale_numeric:
102
- num_pipe.append(('scaler', StandardScaler()))
103
- transformers.append(('num', Pipeline(num_pipe), numeric_cols))
104
- if cat_cols:
105
- if encode_categorical == 'onehot':
106
- cat_pipe = Pipeline([
107
- ('imputer', SimpleImputer(strategy='most_frequent')),
108
- ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
109
- ])
110
- else:
111
- cat_pipe = Pipeline([
112
- ('imputer', SimpleImputer(strategy='most_frequent')),
113
- ('ord', OrdinalEncoder())
114
- ])
115
- transformers.append(('cat', cat_pipe, cat_cols))
116
- return ColumnTransformer(transformers), numeric_cols + cat_cols
117
-
118
- def apply_preprocessing(df: pd.DataFrame, preprocessor: ColumnTransformer) -> pd.DataFrame:
119
- X = preprocessor.fit_transform(df)
120
- feature_names = []
121
- for name, trans, cols in preprocessor.transformers_:
122
- if name == 'num':
123
- feature_names += cols
124
- elif name == 'cat':
125
- try:
126
- ohe = trans.named_steps['onehot']
127
- for col, cats in zip(cols, ohe.categories_):
128
- feature_names += [f"{col}__{c}" for c in cats]
129
- except Exception:
130
- feature_names += cols
131
- return pd.DataFrame(X, columns=feature_names)
132
-
133
- # ---------- LLM (Gemini only) ----------
134
- def build_dataset_prompt(summary, user_question=None):
135
- s = [f"Dataset shape: {summary['shape'][0]} rows, {summary['shape'][1]} columns."]
136
- for c in summary['columns']:
137
- s.append(f"- {c['name']} ({c['dtype']}) missing={c['n_missing']} unique={c['n_unique']}")
138
- s.append("Preview:")
139
- for row in summary['preview']:
140
- s.append(str(row))
141
- if user_question:
142
- s.append(f"User question: {user_question}")
143
- else:
144
- s.append("Please provide a summary, notable patterns, and suggestions for visualizations.")
145
- return "\n".join(s)
146
 
147
- def call_llm_gemini(prompt: str, model="gemini-2.0-flash"):
148
  if not GEMINI_API_KEY:
149
  return "⚠️ Gemini API key not found."
150
  try:
@@ -155,49 +60,45 @@ def call_llm_gemini(prompt: str, model="gemini-2.0-flash"):
155
  return f"❌ Gemini call failed: {e}"
156
 
157
  # ---------- STREAMLIT UI ----------
158
- st.title("📊 Data Analysis & Cleaning App (Gemini-Powered)")
159
- st.markdown("Upload CSV or Excel, clean and preprocess it, visualize data, and get insights powered by **Gemini 2.0 Flash**.")
160
 
 
161
  with st.sidebar:
162
  st.header("⚙️ Options")
163
- st.info("Using **Gemini 2.0 Flash (Google AI)** for insights.")
164
- impute_strategy_num = st.selectbox("Numeric imputation", ['mean', 'median', 'most_frequent'])
165
- encode_categorical = st.selectbox("Categorical encoding", ['onehot', 'ordinal'])
166
- scale_numeric = st.checkbox("Scale numeric features", True)
167
- show_raw_preview = st.checkbox("Show raw preview", True)
168
 
169
  uploaded_file = st.file_uploader("📂 Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
170
 
171
  if uploaded_file:
172
- # ✅ FIX: Save to /tmp for Hugging Face Spaces compatibility
173
  temp_path = os.path.join("/tmp", uploaded_file.name)
174
  with open(temp_path, "wb") as f:
175
  f.write(uploaded_file.getbuffer())
176
  with open(temp_path, "rb") as f:
177
  raw_df = read_file(f)
178
 
179
- if show_raw_preview:
180
- st.subheader("Raw Data Preview")
181
- st.dataframe(raw_df.head())
182
-
183
- st.subheader("Data Cleaning & Standardization")
184
- cleaned_df = standardize_dataframe(raw_df)
185
- st.write(f"✅ Cleaned data shape: {cleaned_df.shape}")
 
 
 
 
186
  st.dataframe(cleaned_df.head())
187
 
188
- st.subheader("Summary")
189
- summary = summarize_dataframe(cleaned_df)
190
- st.write(f"Shape: {summary['shape']}")
191
- st.json(summary['columns'])
192
-
193
- st.subheader("Preprocessing")
194
- if st.button("Generate Preprocessing Pipeline"):
195
- preproc, _ = prepare_preprocessing_pipeline(cleaned_df, impute_strategy_num, scale_numeric, encode_categorical)
196
- processed_df = apply_preprocessing(cleaned_df, preproc)
197
- st.success("Preprocessing complete!")
198
- st.dataframe(processed_df.head())
199
- st.download_button("⬇️ Download Processed CSV", processed_df.to_csv(index=False), "processed_data.csv")
200
 
 
201
  st.subheader("Visualizations")
202
  viz_col = st.selectbox("Select column", options=cleaned_df.columns)
203
  viz_type = st.selectbox("Visualization type", ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter', 'Correlation heatmap'])
@@ -218,19 +119,22 @@ if uploaded_file:
218
  elif viz_type == 'Scatter':
219
  sns.scatterplot(x=cleaned_df[viz_col], y=cleaned_df[second_col], ax=ax)
220
  elif viz_type == 'Correlation heatmap':
221
- corr = cleaned_df.select_dtypes(include=[np.number]).corr()
222
  sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
223
  st.pyplot(fig)
224
  except Exception as e:
225
  st.error(f"Visualization failed: {e}")
226
 
227
- st.subheader("🧠 Ask Gemini for Insights")
 
228
  user_q = st.text_area("Enter your question (optional):")
229
- if st.button("Get Insights"):
230
- with st.spinner("Generating insights via Gemini..."):
231
- prompt = build_dataset_prompt(summary, user_q if user_q else None)
232
- llm_resp = call_llm_gemini(prompt)
233
- st.write(llm_resp)
 
 
234
 
235
  else:
236
- st.info("📥 Upload a file to begin.")
 
1
  # streamlit_data_analysis_app.py
2
+ # Streamlit Data Analysis App with LLM-powered cleaning and insights
3
  # Features:
4
  # - Upload CSV / Excel
5
+ # - Dataset cleaned automatically by Qwen 2.5 Coder
6
+ # - Preprocessing, visualizations, summaries
7
+ # - Insights via Mistral, Mixtral, Qwen 14B, Gemini
 
 
8
 
9
  import os
10
  import streamlit as st
11
  import pandas as pd
 
12
  import matplotlib.pyplot as plt
13
  import seaborn as sns
14
+ from huggingface_hub import InferenceClient
 
 
 
15
  import google.generativeai as genai
16
 
17
+ # ---------- CONFIG ----------
18
+ st.set_page_config(page_title="LLM-Powered Data Analysis", layout="wide")
19
 
20
+ # ---------- API KEYS ----------
21
  try:
22
+ GEMINI_API_KEY = st.secrets.get("GEMINI_API_KEY")
23
+ except:
24
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
25
 
26
+ HF_API_KEY = st.secrets.get("HF_API_KEY") or os.getenv("HF_API_KEY")
27
+
28
  if GEMINI_API_KEY:
29
  genai.configure(api_key=GEMINI_API_KEY)
30
+ hf_client = InferenceClient(token=HF_API_KEY) if HF_API_KEY else None
 
 
31
 
32
  # ---------- UTILITIES ----------
33
  def read_file(uploaded_file):
 
34
  name = uploaded_file.name.lower()
35
+ if name.endswith(('.csv', '.txt')):
36
+ return pd.read_csv(uploaded_file)
37
+ elif name.endswith(('.xls', '.xlsx')):
38
+ return pd.read_excel(uploaded_file)
39
+ else:
40
+ raise ValueError("Unsupported file type. Please upload CSV or Excel.")
41
+
42
+ def call_hf_model(prompt: str, model: str):
43
+ """Call Hugging Face inference API"""
44
+ if not hf_client:
45
+ return "⚠️ HF API key not found."
46
  try:
47
+ output = hf_client.text_generation(model=model, inputs=prompt, max_new_tokens=1024)
48
+ return output[0]["generated_text"]
 
 
 
 
 
 
 
 
49
  except Exception as e:
50
+ return f"❌ HF call failed: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ def call_gemini(prompt: str, model="gemini-2.0-flash"):
53
  if not GEMINI_API_KEY:
54
  return "⚠️ Gemini API key not found."
55
  try:
 
60
  return f"❌ Gemini call failed: {e}"
61
 
62
  # ---------- STREAMLIT UI ----------
63
+ st.title("📊 LLM-Powered Data Analysis App")
64
+ st.markdown("Upload a dataset and let AI clean & analyze it automatically!")
65
 
66
+ # Sidebar options
67
  with st.sidebar:
68
  st.header("⚙️ Options")
69
+ cleaner_model = st.selectbox("Dataset Cleaner", ["Qwen-2.5-coder"])
70
+ analysis_model = st.selectbox("Analysis / Insights Model", ["mistralai/Mistral-7B-Instruct", "mixtral/Mixtral-8B", "Qwen-14B"])
71
+ use_gemini = st.checkbox("Enable Gemini Insights", value=False)
 
 
72
 
73
  uploaded_file = st.file_uploader("📂 Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])
74
 
75
  if uploaded_file:
76
+ # Save file to /tmp for Spaces
77
  temp_path = os.path.join("/tmp", uploaded_file.name)
78
  with open(temp_path, "wb") as f:
79
  f.write(uploaded_file.getbuffer())
80
  with open(temp_path, "rb") as f:
81
  raw_df = read_file(f)
82
 
83
+ st.subheader("Raw Data Preview")
84
+ st.dataframe(raw_df.head())
85
+
86
+ # ---------- DATA CLEANING ----------
87
+ st.subheader("Cleaning dataset with AI...")
88
+ prompt_clean = f"Clean the following dataset and return a valid CSV. Only return CSV text. Input:\n{raw_df.to_csv(index=False)}"
89
+ cleaned_csv_text = call_hf_model(prompt_clean, model=cleaner_model)
90
+
91
+ from io import StringIO
92
+ cleaned_df = pd.read_csv(StringIO(cleaned_csv_text))
93
+ st.success("✅ Dataset cleaned!")
94
  st.dataframe(cleaned_df.head())
95
 
96
+ # ---------- SUMMARY ----------
97
+ st.subheader("Dataset Summary")
98
+ st.write(f"Shape: {cleaned_df.shape}")
99
+ st.dataframe(cleaned_df.describe(include='all'))
 
 
 
 
 
 
 
 
100
 
101
+ # ---------- VISUALIZATIONS ----------
102
  st.subheader("Visualizations")
103
  viz_col = st.selectbox("Select column", options=cleaned_df.columns)
104
  viz_type = st.selectbox("Visualization type", ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter', 'Correlation heatmap'])
 
119
  elif viz_type == 'Scatter':
120
  sns.scatterplot(x=cleaned_df[viz_col], y=cleaned_df[second_col], ax=ax)
121
  elif viz_type == 'Correlation heatmap':
122
+ corr = cleaned_df.select_dtypes(include=['number']).corr()
123
  sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
124
  st.pyplot(fig)
125
  except Exception as e:
126
  st.error(f"Visualization failed: {e}")
127
 
128
+ # ---------- INSIGHTS ----------
129
+ st.subheader("🧠 AI Insights")
130
  user_q = st.text_area("Enter your question (optional):")
131
+ if st.button("Get AI Insights"):
132
+ prompt_analysis = f"Dataset:\n{cleaned_df.to_csv(index=False)}\nQuestion: {user_q if user_q else 'Provide a summary and key patterns.'}"
133
+ if use_gemini:
134
+ resp = call_gemini(prompt_analysis)
135
+ else:
136
+ resp = call_hf_model(prompt_analysis, model=analysis_model)
137
+ st.write(resp)
138
 
139
  else:
140
+ st.info("📥 Upload a dataset to begin.")