Spaces:

Starberry15
/

data_analysis

Sleeping

App Files Files Community

Starberry15 committed on Oct 22

Commit

f62d086

verified ·

1 Parent(s): 64b21b3

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +11 -75

src/streamlit_app.py CHANGED Viewed

@@ -29,9 +29,8 @@ else:
     login(token=HF_TOKEN)
 if GEMINI_API_KEY:
-    gemini_client = genai.Client(api_key=GEMINI_API_KEY)
 else:
-    gemini_client = None
     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
 # ======================================================
@@ -63,9 +62,9 @@ with st.sidebar:
     )
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
-    max_tokens = st.slider("Max Tokens", 128, 2048, 512)
-# Initialize inference clients
 hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
 hf_analyst_client = None
 if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
@@ -75,7 +74,6 @@ if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
 # 🧩 SAFE GENERATION FUNCTION
 # ======================================================
 def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
-    """HF text generation fallback to chat_completion"""
     try:
         resp = client.text_generation(
             prompt,
@@ -113,9 +111,7 @@ def fallback_clean(df: pd.DataFrame) -> pd.DataFrame:
     df.drop_duplicates(inplace=True)
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
-    """Clean dataset using AI. Full dataset sent for thorough cleaning."""
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
@@ -129,7 +125,6 @@ Return ONLY a valid CSV text (no markdown, no explanations).
 Dataset:
 {csv_text}
 """
     try:
         cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
     except Exception as e:
@@ -145,32 +140,16 @@ Dataset:
         st.warning(f"⚠️ AI CSV parse failed: {e}")
         return fallback_clean(df)
 # ======================================================
 # 🧩 DATA ANALYSIS
 # ======================================================
-def summarize_dataframe(df: pd.DataFrame) -> str:
-    lines = [f"Rows: {len(df)} | Columns: {len(df.columns)}", "Column summaries:"]
-    for col in df.columns[:10]:
-        non_null = int(df[col].notnull().sum())
-        if pd.api.types.is_numeric_dtype(df[col]):
-            desc = df[col].describe().to_dict()
-            mean = float(desc.get("mean", np.nan))
-            median = float(df[col].median()) if non_null > 0 else None
-            lines.append(f"- {col}: mean={mean:.3f}, median={median}, non_null={non_null}")
-        else:
-            top = df[col].value_counts().head(3).to_dict()
-            lines.append(f"- {col}: top_values={top}, non_null={non_null}")
-    return "\n".join(lines)
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data analyst.
 Analyze the dataset '{dataset_name}' and answer the user's question.
---- FULL DATA SAMPLE ---
 {csv_text}
 --- USER QUESTION ---
@@ -184,19 +163,20 @@ Respond with:
 """
     try:
         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
-            if gemini_client is None:
                 return "⚠️ Gemini API key missing."
-            response = gemini_client.models.generate_content(
                 model="gemini-2.5-flash",
-                contents=[prompt]
             )
-            return getattr(response, "text", "No response from Gemini.")
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
         return f"⚠️ Analysis failed: {e}"
 # ======================================================
 # 🚀 MAIN APP LOGIC
 # ======================================================
@@ -212,51 +192,7 @@ if uploaded:
     st.dataframe(cleaned_df.head(), use_container_width=True)
     with st.expander("📋 Cleaning Summary", expanded=False):
-        st.text(summarize_dataframe(cleaned_df))
-    with st.expander("📈 Quick Visualizations", expanded=True):
-        numeric_cols = cleaned_df.select_dtypes(include="number").columns.tolist()
-        categorical_cols = cleaned_df.select_dtypes(exclude="number").columns.tolist()
-        viz_type = st.selectbox(
-            "Visualization Type",
-            ["Scatter Plot", "Histogram", "Box Plot", "Correlation Heatmap", "Categorical Count"]
-        )
-        if viz_type == "Scatter Plot" and len(numeric_cols) >= 2:
-            x = st.selectbox("X-axis", numeric_cols)
-            y = st.selectbox("Y-axis", numeric_cols, index=min(1, len(numeric_cols)-1))
-            color = st.selectbox("Color", ["None"] + categorical_cols)
-            fig = px.scatter(cleaned_df, x=x, y=y, color=None if color=="None" else color)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Histogram" and numeric_cols:
-            col = st.selectbox("Column", numeric_cols)
-            fig = px.histogram(cleaned_df, x=col, nbins=30)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Box Plot" and numeric_cols:
-            col = st.selectbox("Column", numeric_cols)
-            fig = px.box(cleaned_df, y=col)
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Correlation Heatmap" and len(numeric_cols) > 1:
-            corr = cleaned_df[numeric_cols].corr()
-            fig = ff.create_annotated_heatmap(
-                z=corr.values,
-                x=list(corr.columns),
-                y=list(corr.index),
-                annotation_text=corr.round(2).values,
-                showscale=True
-            )
-            st.plotly_chart(fig, use_container_width=True)
-        elif viz_type == "Categorical Count" and categorical_cols:
-            cat = st.selectbox("Category", categorical_cols)
-            fig = px.bar(cleaned_df[cat].value_counts().reset_index(), x="index", y=cat)
-            st.plotly_chart(fig, use_container_width=True)
-        else:
-            st.warning("⚠️ Not enough columns for this visualization type.")
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")

     login(token=HF_TOKEN)
 if GEMINI_API_KEY:
+    genai.api_key = GEMINI_API_KEY
 else:
     st.warning("⚠️ Gemini API key missing. Gemini 2.5 Flash will not work.")
 # ======================================================
     )
     temperature = st.slider("Temperature", 0.0, 1.0, 0.3)
+    max_tokens = st.slider("Max Tokens", 128, 4096, 1024)
+# Initialize HF clients
 hf_cleaner_client = InferenceClient(model=CLEANER_MODEL, token=HF_TOKEN)
 hf_analyst_client = None
 if ANALYST_MODEL != "Gemini 2.5 Flash (Google)":
 # 🧩 SAFE GENERATION FUNCTION
 # ======================================================
 def safe_hf_generate(client, prompt, temperature=0.3, max_tokens=512):
     try:
         resp = client.text_generation(
             prompt,
     df.drop_duplicates(inplace=True)
     return df
 def ai_clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data cleaning assistant.
 Dataset:
 {csv_text}
 """
     try:
         cleaned_str = safe_hf_generate(hf_cleaner_client, prompt, temperature=0.1, max_tokens=4096)
     except Exception as e:
         st.warning(f"⚠️ AI CSV parse failed: {e}")
         return fallback_clean(df)
 # ======================================================
 # 🧩 DATA ANALYSIS
 # ======================================================
 def query_analysis_model(df: pd.DataFrame, user_query: str, dataset_name: str) -> str:
     csv_text = df.to_csv(index=False)
     prompt = f"""
 You are a professional data analyst.
 Analyze the dataset '{dataset_name}' and answer the user's question.
+--- FULL DATA ---
 {csv_text}
 --- USER QUESTION ---
 """
     try:
         if ANALYST_MODEL == "Gemini 2.5 Flash (Google)":
+            if GEMINI_API_KEY is None:
                 return "⚠️ Gemini API key missing."
+            response = genai.generate_text(
                 model="gemini-2.5-flash",
+                prompt=prompt,
+                temperature=temperature,
+                max_output_tokens=max_tokens
             )
+            return getattr(response, "candidates", [{"content": "No response from Gemini."}])[0]["content"]
         else:
             return safe_hf_generate(hf_analyst_client, prompt, temperature=temperature, max_tokens=max_tokens)
     except Exception as e:
         return f"⚠️ Analysis failed: {e}"
 # ======================================================
 # 🚀 MAIN APP LOGIC
 # ======================================================
     st.dataframe(cleaned_df.head(), use_container_width=True)
     with st.expander("📋 Cleaning Summary", expanded=False):
+        st.text(f"Rows: {len(cleaned_df)} | Columns: {len(cleaned_df.columns)}")
     st.subheader("💬 Ask AI About Your Data")
     user_query = st.text_area("Enter your question:", placeholder="e.g. What factors influence sales the most?")