Starberry15 committed on
Commit
8ee580a
·
verified ·
1 Parent(s): 1eb934c

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +256 -128
src/streamlit_app.py CHANGED
@@ -1,140 +1,268 @@
1
- # streamlit_data_analysis_app.py
2
- # Streamlit Data Analysis App with LLM-powered cleaning and insights
3
- # Features:
4
- # - Upload CSV / Excel
5
- # - Dataset cleaned automatically by Qwen 2.5 Coder
6
- # - Preprocessing, visualizations, summaries
7
- # - Insights via Mistral, Mixtral, Qwen 14B, Gemini
8
-
9
  import os
10
- import streamlit as st
11
  import pandas as pd
12
- import matplotlib.pyplot as plt
13
- import seaborn as sns
14
- from huggingface_hub import InferenceClient
15
- import google.generativeai as genai
16
-
17
- # ---------- CONFIG ----------
18
st.set_page_config(page_title="LLM-Powered Data Analysis", layout="wide")


# ---------- API KEYS ----------
def _get_secret(name):
    """Return the secret *name* from Streamlit secrets, else from the environment.

    Accessing ``st.secrets`` can raise when no secrets file is configured, so
    that lookup is guarded and treated as "not set". An empty or missing
    secret falls through to ``os.getenv(name)``; the result is ``None`` when
    neither source provides a value.
    """
    try:
        value = st.secrets.get(name)
    except Exception:
        value = None
    return value or os.getenv(name)


GEMINI_API_KEY = _get_secret("GEMINI_API_KEY")
HF_API_KEY = _get_secret("HF_API_KEY")

# Only configure the providers whose credentials are actually available.
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
hf_client = InferenceClient(token=HF_API_KEY) if HF_API_KEY else None
31
-
32
- # ---------- UTILITIES ----------
33
def read_file(uploaded_file):
    """Load an uploaded CSV/TXT or Excel file into a DataFrame.

    The reader is chosen from the lower-cased ``uploaded_file.name`` suffix:
    ``.csv``/``.txt`` go to ``pd.read_csv`` and ``.xls``/``.xlsx`` go to
    ``pd.read_excel``.

    Raises:
        ValueError: if the file name has none of the supported suffixes.
    """
    lowered = uploaded_file.name.lower()
    readers = (
        (('.csv', '.txt'), pd.read_csv),
        (('.xls', '.xlsx'), pd.read_excel),
    )
    for suffixes, reader in readers:
        if lowered.endswith(suffixes):
            return reader(uploaded_file)
    raise ValueError("Unsupported file type. Please upload CSV or Excel.")
41
-
42
def call_hf_model(prompt: str, model: str):
    """Generate text for *prompt* with the Hugging Face model *model*.

    Args:
        prompt: Text sent to the model.
        model: Model identifier passed through to the inference client.

    Returns:
        The generated text, or a human-readable warning/error string when no
        client is configured or the request raises.
    """
    if not hf_client:
        return "⚠️ HF API key not found."
    try:
        # The prompt is the first positional argument of text_generation, and
        # without details=True the call returns the generated string itself,
        # so there is nothing to index into.
        return hf_client.text_generation(prompt, model=model, max_new_tokens=1024)
    except Exception as e:
        return f"❌ HF call failed: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
def call_gemini(prompt: str, model="gemini-2.0-flash"):
    """Ask a Gemini model to respond to *prompt* and return the reply text.

    Returns a warning string when ``GEMINI_API_KEY`` is not set, and an error
    string (instead of raising) when the request fails.
    """
    if GEMINI_API_KEY:
        try:
            reply = genai.GenerativeModel(model).generate_content(prompt)
            return reply.text
        except Exception as err:
            return f"❌ Gemini call failed: {err}"
    return "⚠️ Gemini API key not found."
 
61
 
62
- # ---------- STREAMLIT UI ----------
63
st.title("📊 LLM-Powered Data Analysis App")
st.markdown("Upload a dataset and let AI clean & analyze it automatically!")

# Sidebar options
with st.sidebar:
    st.header("⚙️ Options")
    cleaner_model = st.selectbox("Dataset Cleaner", ["Qwen-2.5-coder"])
    analysis_model = st.selectbox(
        "Analysis / Insights Model",
        ["mistralai/Mistral-7B-Instruct", "mixtral/Mixtral-8B", "Qwen-14B"],
    )
    use_gemini = st.checkbox("Enable Gemini Insights", value=False)

uploaded_file = st.file_uploader("📂 Upload CSV or Excel file", type=['csv', 'xls', 'xlsx', 'txt'])

if uploaded_file:
    # Save file to /tmp for Spaces. Only the base name is used so that a
    # client-supplied name containing path separators cannot escape /tmp.
    temp_path = os.path.join("/tmp", os.path.basename(uploaded_file.name))
    with open(temp_path, "wb") as f:
        f.write(uploaded_file.getbuffer())
    with open(temp_path, "rb") as f:
        raw_df = read_file(f)

    st.subheader("Raw Data Preview")
    st.dataframe(raw_df.head())

    # ---------- DATA CLEANING ----------
    st.subheader("Cleaning dataset with AI...")
    prompt_clean = f"Clean the following dataset and return a valid CSV. Only return CSV text. Input:\n{raw_df.to_csv(index=False)}"
    cleaned_csv_text = call_hf_model(prompt_clean, model=cleaner_model)

    from io import StringIO

    # call_hf_model returns an error message (not CSV) on failure, so the parse
    # is guarded and the raw data is kept when the model output is unusable.
    try:
        cleaned_df = pd.read_csv(StringIO(cleaned_csv_text))
        if cleaned_df.empty:
            raise ValueError("model returned no data rows")
        st.success("✅ Dataset cleaned!")
    except Exception as e:
        st.warning(f"⚠️ AI cleaning failed ({e}); using the raw data instead.")
        cleaned_df = raw_df
    st.dataframe(cleaned_df.head())

    # ---------- SUMMARY ----------
    st.subheader("Dataset Summary")
    st.write(f"Shape: {cleaned_df.shape}")
    st.dataframe(cleaned_df.describe(include='all'))

    # ---------- VISUALIZATIONS ----------
    st.subheader("Visualizations")
    viz_col = st.selectbox("Select column", options=cleaned_df.columns)
    viz_type = st.selectbox(
        "Visualization type",
        ['Histogram', 'Boxplot', 'Bar (categorical)', 'Scatter', 'Correlation heatmap'],
    )

    if viz_type == 'Scatter':
        second_col = st.selectbox("Second column", options=[c for c in cleaned_df.columns if c != viz_col])

    if st.button("Show Visualization"):
        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            if viz_type == 'Histogram':
                sns.histplot(cleaned_df[viz_col], kde=True, ax=ax)
            elif viz_type == 'Boxplot':
                sns.boxplot(x=cleaned_df[viz_col], ax=ax)
            elif viz_type == 'Bar (categorical)':
                counts = cleaned_df[viz_col].astype(str).value_counts().head(20)
                sns.barplot(x=counts.values, y=counts.index, ax=ax)
            elif viz_type == 'Scatter':
                sns.scatterplot(x=cleaned_df[viz_col], y=cleaned_df[second_col], ax=ax)
            elif viz_type == 'Correlation heatmap':
                corr = cleaned_df.select_dtypes(include=['number']).corr()
                sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Visualization failed: {e}")
        finally:
            # Release the figure so repeated clicks do not accumulate open figures.
            plt.close(fig)

    # ---------- INSIGHTS ----------
    st.subheader("🧠 AI Insights")
    user_q = st.text_area("Enter your question (optional):")
    if st.button("Get AI Insights"):
        prompt_analysis = f"Dataset:\n{cleaned_df.to_csv(index=False)}\nQuestion: {user_q if user_q else 'Provide a summary and key patterns.'}"
        if use_gemini:
            resp = call_gemini(prompt_analysis)
        else:
            resp = call_hf_model(prompt_analysis, model=analysis_model)
        st.write(resp)

else:
    st.info("📥 Upload a dataset to begin.")
 
 
 
 
 
 
 
 
 
1
  import os
 
2
  import pandas as pd
3
+ import numpy as np
4
+ import streamlit as st
5
+ import plotly.express as px
6
+ import plotly.figure_factory as ff
7
+ from dotenv import load_dotenv
8
+ from huggingface_hub import InferenceClient, login
9
+ from io import StringIO
10
+
11
+ # ======================================================
12
+ # βš™οΈ APP CONFIGURATION
13
+ # ======================================================
14
st.set_page_config(page_title="📊 Smart Data Analyst Pro", layout="wide")
st.title("📊 Smart Data Analyst Pro")
st.caption("AI that cleans, analyzes, and visualizes your data — powered by Hugging Face Inference API.")

# ======================================================
# 🔐 Load Environment Variables
# ======================================================
load_dotenv()
# Either variable name is accepted; HF_TOKEN wins when both are set.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_API_KEY")
if not HF_TOKEN:
    st.error("❌ Missing HF_TOKEN. Please set it in your .env file.")
else:
    try:
        login(token=HF_TOKEN)
    except Exception as e:
        # A failed login is reported in the UI instead of aborting the whole
        # script; the token is still passed explicitly to the clients below.
        st.warning(f"⚠️ Hugging Face login failed: {e}")

# ======================================================
# 🧠 MODEL SETUP
# ======================================================
with st.sidebar:
    st.header("⚙️ Model Settings")

    CLEANER_MODEL = st.selectbox(
        "Select Cleaner Model:",
        [
            "Qwen/Qwen2.5-Coder-7B-Instruct",
            "meta-llama/Meta-Llama-3-8B-Instruct",
            "microsoft/Phi-3-mini-4k-instruct",
            "mistralai/Mistral-7B-Instruct-v0.3",
        ],
        index=0,
    )

    ANALYST_MODEL = st.selectbox(
        "Select Analysis Model:",
        [
            "Qwen/Qwen2.5-14B-Instruct",
            "mistralai/Mistral-7B-Instruct-v0.3",
            "HuggingFaceH4/zephyr-7b-beta",
        ],
        index=0,
    )

    # Generation settings read by the analysis request.
    temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
    max_tokens = st.slider("Max Tokens", 128, 2048, 512)

# Initialize inference clients (one per selected model)
cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
analyst_client = InferenceClient(model=ANALYST_MODEL, token=HF_TOKEN)
61
+
62
+ # ======================================================
63
+ # 🧩 SAFE GENERATION FUNCTION
64
+ # ======================================================
65
def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
    """Generate text with *client*, falling back to chat completion if needed.

    ``client.text_generation`` is tried first. If it raises an error whose
    message says the task is not supported (or that only the conversational
    task is supported), the same prompt is sent as a single user message via
    ``client.chat_completion``. Any other error propagates unchanged.

    Args:
        client: Object exposing ``text_generation`` and ``chat_completion``.
        prompt: Prompt text.
        temperature: Sampling temperature forwarded to the client.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The generated text with surrounding whitespace stripped; an empty
        string when the chat response carries no content.
    """
    try:
        resp = client.text_generation(
            prompt,
            temperature=temperature,
            max_new_tokens=max_tokens,
            return_full_text=False,
        )
        return resp.strip()
    except Exception as e:
        # Match case-insensitively so differently capitalised provider
        # messages still trigger the fallback.
        message = str(e).lower()
        if "supported task: conversational" not in message and "not supported" not in message:
            raise  # bare raise keeps the original traceback

        chat_resp = client.chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        content = chat_resp["choices"][0]["message"]["content"]
        # Content may be missing; treat that as an empty reply.
        return (content or "").strip()
88
+
89
+ # ======================================================
90
+ # 🧩 SMART DATA CLEANING
91
+ # ======================================================
92
def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Backup rule-based cleaner.

    Works on a copy, so *df* is left untouched. Steps, in order:

    1. Drop columns that are entirely missing.
    2. Normalise column labels: stringify, strip, spaces to ``_``, lower-case.
    3. Fill missing values: median for numeric/datetime/timedelta columns,
       most frequent value for everything else (``"Unknown"`` if there is no
       mode).
    4. Drop duplicate rows.

    Returns:
        The cleaned DataFrame.
    """
    cleaned = df.dropna(axis=1, how="all").copy()
    # str() first: labels may be integers (e.g. header-less sheets).
    cleaned.columns = [str(c).strip().replace(" ", "_").lower() for c in cleaned.columns]

    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue

        has_median = (
            pd.api.types.is_numeric_dtype(series)
            or pd.api.types.is_datetime64_any_dtype(series)
            or pd.api.types.is_timedelta64_dtype(series)
        )
        if has_median:
            fill_value = series.median()
        else:
            modes = series.mode()
            fill_value = modes.iloc[0] if not modes.empty else "Unknown"

        # Assign the result back: an in-place fillna on cleaned[col] acts on a
        # temporary object and does not reliably update the frame.
        cleaned[col] = series.fillna(fill_value)

    return cleaned.drop_duplicates()
107
+
108
+
109
def ai_clean_dataset(df: pd.DataFrame, sample_rows: int = 5) -> pd.DataFrame:
    """Clean *df* with the selected AI model, falling back to rule-based cleaning.

    Only the first *sample_rows* rows are sent to the model, so its CSV reply
    can describe at most that preview. The reply is therefore used as the
    result only when the preview *is* the whole dataset. For larger inputs
    every row is kept: the full frame goes through ``fallback_clean`` and the
    model's column names are adopted when they line up one-to-one.

    Any model error, unparsable reply, or empty reply results in
    ``fallback_clean(df)`` plus a warning in the UI.

    Args:
        df: Raw dataset.
        sample_rows: Number of leading rows shown to the model.

    Returns:
        The cleaned DataFrame.
    """
    raw_preview = df.head(sample_rows).to_csv(index=False)
    prompt = f"""
You are a professional data cleaning assistant.
Clean and standardize the dataset below dynamically:
1. Handle missing values
2. Fix column name inconsistencies
3. Convert data types (dates, numbers, categories)
4. Remove irrelevant or duplicate rows
Return ONLY a valid CSV text (no markdown, no explanations).

--- RAW SAMPLE ---
{raw_preview}
"""

    try:
        cleaned_str = safe_hf_generate(cleaner_client, prompt, temperature=0.1, max_tokens=1024)
    except Exception as e:
        st.warning(f"⚠️ AI cleaning failed: {e}")
        return fallback_clean(df)

    # Strip markdown fences / heading markers the model may add despite the prompt.
    cleaned_str = (
        cleaned_str.replace("```csv", "")
        .replace("```", "")
        .replace("###", "")
        .strip()
    )
    lines = cleaned_str.splitlines()

    # Detect a semicolon-delimited reply from its first delimited line instead
    # of rewriting every ';' to ',', which would corrupt values containing ';'.
    first_row = next((ln for ln in lines if "," in ln or ";" in ln), "")
    sep = ";" if ";" in first_row and "," not in first_row else ","

    # Keep only delimited rows and drop trailing commentary lines.
    lines = [
        ln for ln in lines
        if sep in ln and not ln.strip().lower().startswith(("note", "summary"))
    ]
    cleaned_str = "\n".join(lines)

    try:
        cleaned_df = pd.read_csv(StringIO(cleaned_str), sep=sep, on_bad_lines="skip")
        cleaned_df = cleaned_df.dropna(axis=1, how="all")
        cleaned_df.columns = [str(c).strip().replace(" ", "_").lower() for c in cleaned_df.columns]
    except Exception as e:
        st.warning(f"⚠️ AI CSV parse failed: {e}")
        return fallback_clean(df)

    if cleaned_df.empty:
        st.warning("⚠️ AI CSV parse failed: model returned no data rows")
        return fallback_clean(df)

    # The model saw the whole dataset: its reply is the cleaned dataset.
    if len(df) <= sample_rows:
        return cleaned_df

    # The model only saw a preview; returning its reply would silently
    # truncate the data. Clean all rows with the rule-based cleaner and reuse
    # the model's column names only when the columns match one-to-one.
    full_df = fallback_clean(df)
    if len(cleaned_df.columns) == len(full_df.columns) and cleaned_df.columns.is_unique:
        full_df.columns = list(cleaned_df.columns)
    return full_df
151
+
152
+
153
def summarize_dataframe(df: pd.DataFrame, max_cols: int = 10) -> str:
    """Generate a concise text summary of the dataframe.

    The first line gives the row and column counts. Each of the first
    *max_cols* columns then gets one line: mean, median and non-null count
    for numeric (including boolean) columns, or the three most frequent
    values and non-null count otherwise. When columns are left out, a final
    line states how many.

    Args:
        df: DataFrame to describe.
        max_cols: Maximum number of columns to summarise.

    Returns:
        The summary as a newline-joined string.
    """
    total_cols = len(df.columns)
    lines = [f"Rows: {len(df)} | Columns: {total_cols}", "Column summaries:"]

    # Iterate by position so duplicate column labels still yield one Series each.
    for col, series in df.iloc[:, :max_cols].items():
        non_null = int(series.notnull().sum())
        if pd.api.types.is_numeric_dtype(series):
            if non_null > 0:
                # Computed directly: describe() reports no "mean" for boolean columns.
                mean = float(series.mean())
                median = float(series.median())
            else:
                mean, median = np.nan, None
            lines.append(f"- {col}: mean={mean:.3f}, median={median}, non_null={non_null}")
        else:
            top = series.value_counts().head(3).to_dict()
            lines.append(f"- {col}: top_values={top}, non_null={non_null}")

    omitted = total_cols - len(lines) + 2
    if omitted > 0:
        lines.append(f"... ({omitted} more columns not shown)")
    return "\n".join(lines)
167
+
168
+
169
def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
    """Ask the analysis model a question about *df* and return its answer.

    The prompt combines the text summary from ``summarize_dataframe``, the
    first six rows as CSV, and the user's question. Generation uses the
    module-level ``temperature`` and ``max_tokens`` settings. Errors are not
    raised; they are returned as a warning string.
    """
    overview = summarize_dataframe(df)
    preview_csv = df.head(6).to_csv(index=False)
    prompt = f"""
You are a professional data analyst.
Analyze the dataset '{dataset_name}' and answer the user's question.

--- SUMMARY ---
{overview}

--- SAMPLE DATA ---
{preview_csv}

--- USER QUESTION ---
{user_query}

Respond with:
1. Key insights and patterns
2. Quantitative findings
3. Notable relationships or anomalies
4. Data-driven recommendations
"""

    try:
        return safe_hf_generate(
            analyst_client,
            prompt,
            temperature=temperature,
            max_tokens=max_tokens,
        )
    except Exception as exc:
        return f"⚠️ Analysis failed: {exc}"
198
+
199
+ # ======================================================
200
+ # 🚀 MAIN APP LOGIC
201
+ # ======================================================
202
uploaded = st.file_uploader("📎 Upload CSV or Excel file", type=["csv", "xlsx"])

if uploaded:
    # Streamlit re-runs the script on each widget interaction. The cleaned
    # frame is kept in session state so the model is only called again when
    # the uploaded file or the cleaner model changes.
    cache_key = (uploaded.name, uploaded.size, CLEANER_MODEL)
    if st.session_state.get("_cleaned_key") != cache_key:
        try:
            # Case-insensitive so "DATA.CSV" is not routed to the Excel reader.
            if uploaded.name.lower().endswith(".csv"):
                df = pd.read_csv(uploaded)
            else:
                df = pd.read_excel(uploaded)
        except Exception as e:
            st.error(f"❌ Could not read the uploaded file: {e}")
            st.stop()

        with st.spinner("🧼 AI Cleaning your dataset..."):
            st.session_state["_cleaned_df"] = ai_clean_dataset(df)
        st.session_state["_cleaned_key"] = cache_key

    cleaned_df = st.session_state["_cleaned_df"]

    st.subheader("✅ Cleaned Dataset Preview")
    st.dataframe(cleaned_df.head(), use_container_width=True)

    with st.expander("📋 Cleaning Summary", expanded=False):
        st.text(summarize_dataframe(cleaned_df))

    with st.expander("📈 Quick Visualizations", expanded=True):
        numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
        categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()

        viz_type = st.selectbox(
            "Visualization Type",
            ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"],
        )

        if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
            x = st.selectbox("X-axis", numeric_cols)
            y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols) - 1))
            color = st.selectbox("Color", ["None"] + categorical_cols)
            fig = px.scatter(cleaned_df, x=x, y=y, color=None if color == "None" else color)
            st.plotly_chart(fig, use_container_width=True)

        elif viz_type == "Histogram" and numeric_cols:
            col = st.selectbox("Column", numeric_cols)
            fig = px.histogram(cleaned_df, x=col, nbins=30)
            st.plotly_chart(fig, use_container_width=True)

        elif viz_type == "Box Plot" and numeric_cols:
            col = st.selectbox("Column", numeric_cols)
            fig = px.box(cleaned_df, y=col)
            st.plotly_chart(fig, use_container_width=True)

        elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
            corr = cleaned_df[numeric_cols].corr()
            fig = ff.create_annotated_heatmap(
                z=corr.values,
                x=list(corr.columns),
                y=list(corr.index),
                annotation_text=corr.round(2).values,
                showscale=True,
            )
            st.plotly_chart(fig, use_container_width=True)

        elif viz_type == "Categorical Count" and categorical_cols:
            cat = st.selectbox("Category", categorical_cols)
            # Name both output columns explicitly: what reset_index() calls them
            # differs between pandas versions, so "index" cannot be relied on.
            counts = cleaned_df[cat].value_counts().rename_axis(cat).reset_index(name="count")
            fig = px.bar(counts, x=cat, y="count")
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.warning("⚠️ Not enough columns for this visualization type.")

    st.subheader("💬 Ask AI About Your Data")
    user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")
    if st.button("Analyze with AI", use_container_width=True) and user_query:
        with st.spinner("🤖 Interpreting data..."):
            result = query_analysis_model(cleaned_df, user_query, uploaded.name)
        st.markdown("### 💡 Insights")
        st.markdown(result)
else:
    st.info("📥 Upload a dataset to begin smart analysis.")