Spaces:

whackthejacker
/

DataHubHub

Paused

App Files Files Community

DataHubHub / components /dataset_visualization.py

whackthejacker

Upload 34 files

43b66f1 verified 8 months ago

raw

history blame

20.8 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objects as go
	from plotly.subplots import make_subplots

	def render_dataset_visualization(dataset, dataset_type):
	"""
	Renders visualizations for the dataset.

	Args:
	dataset: The dataset to visualize (pandas DataFrame)
	dataset_type: The type of dataset (csv, json, etc.)
	"""
	if dataset is None:
	st.warning("No dataset to visualize.")
	return

	st.markdown("<h3>Dataset Visualization</h3>", unsafe_allow_html=True)

	# Get column types
	numeric_cols = dataset.select_dtypes(include=[np.number]).columns.tolist()
	categorical_cols = dataset.select_dtypes(include=['object', 'category']).columns.tolist()
	date_cols = [col for col in dataset.columns if dataset[col].dtype == 'datetime64[ns]']

	# Add visualization options based on column types
	viz_type = st.selectbox(
	"Select visualization type",
	["Distribution", "Correlation", "Categories", "Time Series", "Custom"],
	help="Choose the type of visualization to create"
	)

	if viz_type == "Distribution":
	if numeric_cols:
	# Select columns for distribution visualization
	selected_cols = st.multiselect(
	"Select columns to visualize",
	numeric_cols,
	default=numeric_cols[:min(3, len(numeric_cols))]
	)

	if not selected_cols:
	st.warning("Please select at least one column to visualize.")
	return

	# Distribution plots
	if len(selected_cols) == 1:
	# Single column histogram with density curve
	col = selected_cols[0]
	fig = px.histogram(
	dataset,
	x=col,
	histnorm='probability density',
	title=f"Distribution of {col}",
	color_discrete_sequence=["#FFD21E"],
	template="simple_white"
	)
	fig.add_traces(
	go.Scatter(
	x=dataset[col].sort_values(),
	y=dataset[col].sort_values().reset_index(drop=True).rolling(
	window=int(len(dataset[col])/10) if len(dataset[col]) > 10 else len(dataset[col]),
	min_periods=1,
	center=True
	).mean(),
	mode='lines',
	line=dict(color="#2563EB", width=3),
	name='Smoothed'
	)
	)
	st.plotly_chart(fig, use_container_width=True)
	else:
	# Multiple histograms in a grid
	num_cols = min(len(selected_cols), 2)
	num_rows = (len(selected_cols) + num_cols - 1) // num_cols

	fig = make_subplots(
	rows=num_rows,
	cols=num_cols,
	subplot_titles=[f"Distribution of {col}" for col in selected_cols]
	)

	for i, col in enumerate(selected_cols):
	row = i // num_cols + 1
	col_pos = i % num_cols + 1

	# Add histogram
	fig.add_trace(
	go.Histogram(
	x=dataset[col],
	name=col,
	marker_color="#FFD21E"
	),
	row=row, col=col_pos
	)

	fig.update_layout(
	title="Distribution of Selected Features",
	showlegend=False,
	template="simple_white",
	height=300 * num_rows
	)
	st.plotly_chart(fig, use_container_width=True)

	# Show distribution statistics
	st.markdown("### Distribution Statistics")
	stats_df = dataset[selected_cols].describe().T
	st.dataframe(stats_df, use_container_width=True)
	else:
	st.warning("No numeric columns found for distribution visualization.")

	elif viz_type == "Correlation":
	if len(numeric_cols) >= 2:
	# Correlation matrix
	st.markdown("### Correlation Matrix")

	# Select columns for correlation
	selected_cols = st.multiselect(
	"Select columns for correlation analysis",
	numeric_cols,
	default=numeric_cols[:min(5, len(numeric_cols))]
	)

	if len(selected_cols) < 2:
	st.warning("Please select at least two columns for correlation analysis.")
	return

	# Compute correlation
	corr = dataset[selected_cols].corr()

	# Heatmap
	fig = px.imshow(
	corr,
	color_continuous_scale="RdBu_r",
	title="Correlation Matrix",
	template="simple_white",
	text_auto=True
	)
	st.plotly_chart(fig, use_container_width=True)

	# Scatter plot matrix for selected columns
	if len(selected_cols) > 2 and len(selected_cols) <= 5: # Limit to 5 columns for readability
	st.markdown("### Scatter Plot Matrix")
	fig = px.scatter_matrix(
	dataset,
	dimensions=selected_cols,
	color_discrete_sequence=["#2563EB"],
	title="Scatter Plot Matrix",
	template="simple_white"
	)
	fig.update_traces(diagonal_visible=False)
	st.plotly_chart(fig, use_container_width=True)

	# Correlation pairs as bar chart
	st.markdown("### Top Correlation Pairs")

	# Get correlation pairs
	corr_pairs = []
	for i in range(len(corr.columns)):
	for j in range(i+1, len(corr.columns)):
	corr_pairs.append({
	'Feature 1': corr.columns[i],
	'Feature 2': corr.columns[j],
	'Correlation': corr.iloc[i, j]
	})

	# Sort by absolute correlation
	corr_pairs = sorted(corr_pairs, key=lambda x: abs(x['Correlation']), reverse=True)

	# Create bar chart
	if corr_pairs:
	# Convert to DataFrame
	corr_df = pd.DataFrame(corr_pairs)
	pair_labels = [f"{row['Feature 1']} & {row['Feature 2']}" for _, row in corr_df.iterrows()]

	# Bar chart
	fig = px.bar(
	x=pair_labels,
	y=[abs(c) for c in corr_df['Correlation']],
	color=corr_df['Correlation'],
	color_continuous_scale="RdBu_r",
	labels={'x': 'Feature Pairs', 'y': 'Absolute Correlation'},
	title="Top Feature Correlations"
	)
	st.plotly_chart(fig, use_container_width=True)
	else:
	st.warning("Need at least two numeric columns for correlation analysis.")

	elif viz_type == "Categories":
	if categorical_cols:
	# Select categorical column
	selected_cat = st.selectbox("Select categorical column", categorical_cols)

	# Category counts
	value_counts = dataset[selected_cat].value_counts()

	# Limit to top N categories if there are too many
	if len(value_counts) > 20:
	st.info(f"Showing top 20 categories out of {len(value_counts)}")
	value_counts = value_counts.head(20)

	# Bar chart
	fig = px.bar(
	x=value_counts.index,
	y=value_counts.values,
	title=f"Category Counts for {selected_cat}",
	labels={'x': selected_cat, 'y': 'Count'},
	color_discrete_sequence=["#FFD21E"]
	)
	st.plotly_chart(fig, use_container_width=True)

	# If there are numeric columns, show relationship with categorical
	if numeric_cols:
	st.markdown(f"### {selected_cat} vs Numeric Features")
	selected_num = st.selectbox("Select numeric column", numeric_cols)

	# Box plot
	fig = px.box(
	dataset,
	x=selected_cat,
	y=selected_num,
	title=f"{selected_cat} vs {selected_num}",
	color_discrete_sequence=["#2563EB"],
	template="simple_white"
	)
	st.plotly_chart(fig, use_container_width=True)

	# Statistics by category
	st.markdown(f"### Statistics of {selected_num} by {selected_cat}")
	stats_by_cat = dataset.groupby(selected_cat)[selected_num].describe()
	st.dataframe(stats_by_cat, use_container_width=True)
	else:
	st.warning("No categorical columns found for category visualization.")

	elif viz_type == "Time Series":
	# Check if there are potential date columns
	potential_date_cols = date_cols.copy()

	# Also check for object columns that might be dates
	for col in categorical_cols:
	# Sample the column to check if it contains date-like strings
	sample = dataset[col].dropna().head(5).tolist()
	if sample and all('/' in str(x) or '-' in str(x) for x in sample):
	potential_date_cols.append(col)

	if potential_date_cols:
	date_col = st.selectbox("Select date column", potential_date_cols)

	# Convert to datetime if it's not already
	if dataset[date_col].dtype != 'datetime64[ns]':
	try:
	temp_df = dataset.copy()
	temp_df[date_col] = pd.to_datetime(temp_df[date_col])
	except:
	st.error(f"Could not convert {date_col} to datetime.")
	return
	else:
	temp_df = dataset.copy()

	# Select numeric column for time series
	if numeric_cols:
	value_col = st.selectbox("Select value column", numeric_cols)

	# Aggregate by time period
	time_period = st.selectbox(
	"Aggregate by",
	["Day", "Week", "Month", "Quarter", "Year"]
	)

	# Set up time grouping
	if time_period == "Day":
	temp_df['period'] = temp_df[date_col].dt.date
	elif time_period == "Week":
	temp_df['period'] = temp_df[date_col].dt.to_period('W').dt.start_time
	elif time_period == "Month":
	temp_df['period'] = temp_df[date_col].dt.to_period('M').dt.start_time
	elif time_period == "Quarter":
	temp_df['period'] = temp_df[date_col].dt.to_period('Q').dt.start_time
	else: # Year
	temp_df['period'] = temp_df[date_col].dt.year

	# Aggregate data
	agg_method = st.selectbox("Aggregation method", ["Mean", "Sum", "Min", "Max", "Count"])
	agg_map = {
	"Mean": "mean",
	"Sum": "sum",
	"Min": "min",
	"Max": "max",
	"Count": "count"
	}

	time_series = temp_df.groupby('period')[value_col].agg(agg_map[agg_method]).reset_index()

	# Line chart
	fig = px.line(
	time_series,
	x='period',
	y=value_col,
	title=f"{agg_method} of {value_col} by {time_period}",
	markers=True,
	color_discrete_sequence=["#2563EB"],
	template="simple_white"
	)
	fig.update_layout(
	xaxis_title=time_period,
	yaxis_title=f"{agg_method} of {value_col}"
	)
	st.plotly_chart(fig, use_container_width=True)

	# Show trendline option
	if st.checkbox("Show trendline"):
	fig = px.scatter(
	time_series,
	x='period',
	y=value_col,
	trendline="ols",
	title=f"{agg_method} of {value_col} by {time_period} with Trendline",
	color_discrete_sequence=["#2563EB"],
	template="simple_white"
	)
	fig.update_layout(
	xaxis_title=time_period,
	yaxis_title=f"{agg_method} of {value_col}"
	)
	st.plotly_chart(fig, use_container_width=True)

	# Table view of time series data
	st.dataframe(time_series, use_container_width=True)
	else:
	st.warning("No numeric columns found for time series values.")
	else:
	st.warning("No date columns found for time series visualization.")

	elif viz_type == "Custom":
	st.markdown("### Custom Visualization")
	st.info("Create a custom plot by selecting axes and plot type")

	# Select plot type
	plot_type = st.selectbox(
	"Select plot type",
	["Scatter", "Line", "Bar", "Box", "Violin", "Histogram", "Pie", "3D Scatter"]
	)

	# Depending on the plot type, get required axes
	if plot_type in ["Scatter", "Line", "Bar", "3D Scatter"]:
	# For scatter/line/bar, we need x and y
	x_col = st.selectbox("X-axis", dataset.columns.tolist())
	y_col = st.selectbox("Y-axis", numeric_cols if numeric_cols else dataset.columns.tolist())

	# For 3D scatter, we need a z-axis
	if plot_type == "3D Scatter":
	z_col = st.selectbox("Z-axis", numeric_cols if numeric_cols else dataset.columns.tolist())

	# Optional color dimension
	use_color = st.checkbox("Add color dimension")
	color_col = None
	if use_color:
	color_col = st.selectbox("Color by", dataset.columns.tolist())

	# Create plot
	if plot_type == "Scatter":
	fig = px.scatter(
	dataset,
	x=x_col,
	y=y_col,
	color=color_col,
	title=f"{y_col} vs {x_col}",
	template="simple_white"
	)
	elif plot_type == "Line":
	fig = px.line(
	dataset.sort_values(x_col),
	x=x_col,
	y=y_col,
	color=color_col,
	title=f"{y_col} vs {x_col}",
	template="simple_white"
	)
	elif plot_type == "Bar":
	fig = px.bar(
	dataset,
	x=x_col,
	y=y_col,
	color=color_col,
	title=f"{y_col} by {x_col}",
	template="simple_white"
	)
	elif plot_type == "3D Scatter":
	fig = px.scatter_3d(
	dataset,
	x=x_col,
	y=y_col,
	z=z_col,
	color=color_col,
	title=f"3D Scatter: {x_col}, {y_col}, {z_col}",
	template="simple_white"
	)

	st.plotly_chart(fig, use_container_width=True)

	elif plot_type in ["Box", "Violin"]:
	# For box/violin, we need x (categorical) and y (numeric)
	x_col = st.selectbox("X-axis (categories)", categorical_cols if categorical_cols else dataset.columns.tolist())
	y_col = st.selectbox("Y-axis (values)", numeric_cols if numeric_cols else dataset.columns.tolist())

	# Optional color dimension
	use_color = st.checkbox("Add color dimension")
	color_col = None
	if use_color:
	color_col = st.selectbox("Color by", dataset.columns.tolist())

	# Create plot
	if plot_type == "Box":
	fig = px.box(
	dataset,
	x=x_col,
	y=y_col,
	color=color_col,
	title=f"Box Plot: {y_col} by {x_col}",
	template="simple_white"
	)
	else: # Violin
	fig = px.violin(
	dataset,
	x=x_col,
	y=y_col,
	color=color_col,
	title=f"Violin Plot: {y_col} by {x_col}",
	template="simple_white"
	)

	st.plotly_chart(fig, use_container_width=True)

	elif plot_type == "Histogram":
	# For histogram, we need just one column
	value_col = st.selectbox("Value column", dataset.columns.tolist())

	# Bins option
	n_bins = st.slider("Number of bins", 5, 100, 20)

	# Optional color dimension
	use_color = st.checkbox("Add color dimension")
	color_col = None
	if use_color:
	color_col = st.selectbox("Color by", dataset.columns.tolist())

	# Create plot
	fig = px.histogram(
	dataset,
	x=value_col,
	color=color_col,
	nbins=n_bins,
	title=f"Histogram of {value_col}",
	template="simple_white"
	)

	st.plotly_chart(fig, use_container_width=True)

	elif plot_type == "Pie":
	# For pie, we need a categorical column
	cat_col = st.selectbox("Category column", categorical_cols if categorical_cols else dataset.columns.tolist())

	# Optional value column
	use_values = st.checkbox("Use custom values")
	value_col = None
	if use_values and numeric_cols:
	value_col = st.selectbox("Value column", numeric_cols)

	# Limit to top N categories if there are too many
	top_n = st.slider("Limit to top N categories", 0, 20, 10,
	help="Set to 0 to show all categories. Recommended to limit to top 10-15 categories for readability.")

	# Process data for pie chart
	if top_n > 0:
	if use_values and value_col:
	pie_data = dataset.groupby(cat_col)[value_col].sum().reset_index()
	pie_data = pie_data.sort_values(value_col, ascending=False).head(top_n)
	else:
	value_counts = dataset[cat_col].value_counts().reset_index()
	value_counts.columns = [cat_col, 'count']
	pie_data = value_counts.head(top_n)
	value_col = 'count'
	else:
	if use_values and value_col:
	pie_data = dataset.groupby(cat_col)[value_col].sum().reset_index()
	else:
	value_counts = dataset[cat_col].value_counts().reset_index()
	value_counts.columns = [cat_col, 'count']
	pie_data = value_counts
	value_col = 'count'

	# Create plot
	fig = px.pie(
	pie_data,
	names=cat_col,
	values=value_col,
	title=f"Pie Chart of {cat_col}",
	template="simple_white"
	)

	st.plotly_chart(fig, use_container_width=True)