Spaces:

reab5555
/

Data-Distribution-Fit

Sleeping

App Files Files Community

Data-Distribution-Fit / app.py

reab5555

Update app.py

458422b verified about 1 year ago

raw

history blame contribute delete

6.42 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	from scipy.stats import kstest, norm, expon, lognorm, chi2, beta, gamma, uniform, pareto, cauchy, t, \
	weibull_min, laplace, logistic, burr, invgamma, invgauss, gompertz, triang, loglaplace, levy, gumbel_r, gumbel_l, \
	rayleigh, powerlaw
	import warnings
	import tempfile

	# Silence every RuntimeWarning process-wide (the filter is not limited to a
	# particular message or module).
	# NOTE(review): presumably aimed at the numerical warnings SciPy emits while
	# fitting ill-suited distributions - confirm; as written it also hides
	# RuntimeWarnings raised anywhere else in the app.
	warnings.filterwarnings("ignore", category=RuntimeWarning)

	# Candidate distributions, keyed by the display name used in the result text
	# and in the plot legends.
	_DISTRIBUTIONS = {
	    'Normal': norm,
	    'Exponential': expon,
	    'Lognormal': lognorm,
	    'Chi-square': chi2,
	    'Beta': beta,
	    'Gamma': gamma,
	    'Uniform': uniform,
	    'Pareto': pareto,
	    'Cauchy': cauchy,
	    'Student\'s t': t,
	    'Weibull': weibull_min,
	    'Laplace': laplace,
	    'Logistic': logistic,
	    'Burr': burr,
	    'Inverse Gamma': invgamma,
	    'Inverse Gaussian': invgauss,
	    'Gompertz': gompertz,
	    'Triangular': triang,
	    'Log-Laplace': loglaplace,
	    'Levy': levy,
	    'Gumbel Right': gumbel_r,
	    'Gumbel Left': gumbel_l,
	    'Rayleigh': rayleigh,
	    'Powerlaw': powerlaw,
	}

	# Line colors for the 1st, 2nd and 3rd best fits, the normal fit, and the data.
	_FIT_COLORS = ('gold', 'silver', 'brown')
	_NORMAL_COLOR = 'red'
	_ACTUAL_DATA_COLOR = 'black'


	def _format_p_value(p_value):
	    """Return the p-value as display text, collapsing values below 0.001 to "<0.001"."""
	    return "<0.001" if p_value < 0.001 else f"{p_value:.5f}"


	def _save_density_plot(data, curves, title, show_histogram):
	    """Draw the sample's density with fitted PDFs overlaid and save it as a PNG.

	    Args:
	        data: NaN-free numeric sample to plot.
	        curves: Iterable of ``(x, y, color, label)`` tuples, one per fitted PDF.
	        title: Title placed above the axes.
	        show_histogram: When true, a 50-bin density histogram is drawn behind
	            the KDE line; the KDE line is always drawn.

	    Returns:
	        Path of the saved PNG. The file is created with ``delete=False``, so it
	        stays on disk after this function returns.
	    """
	    fig, ax = plt.subplots(figsize=(12, 8), dpi=400)
	    try:
	        if show_histogram:
	            sns.histplot(data, kde=False, stat="density", bins=50, color=_ACTUAL_DATA_COLOR,
	                         label='Actual Data Distribution', ax=ax)
	        sns.kdeplot(data, color=_ACTUAL_DATA_COLOR, lw=2,
	                    label='Actual Data Distribution Line', ax=ax)
	        for x_values, y_values, color, label in curves:
	            ax.plot(x_values, y_values, color=color, lw=2, label=label)
	        ax.set_title(title)
	        ax.legend()

	        # Only the unique path is needed: close the handle before matplotlib
	        # reopens the file, so no descriptor is leaked and the path is not
	        # held open while savefig writes to it.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp:
	            path = tmp.name
	        fig.savefig(path)
	    finally:
	        # Release the figure even when plotting or saving fails.
	        plt.close(fig)
	    return path


	# Function to check distribution type
	def check_distribution(target_column, show_histogram):
	    """Fit the candidate distributions to a column and report the three best matches.

	    Every distribution in ``_DISTRIBUTIONS`` is fitted to the non-missing values
	    and scored with a Kolmogorov-Smirnov test; candidates are ranked by p-value,
	    highest first. Two PNG plots are written to temporary files: the top three
	    fits overlaid on the data, and a comparison against a fitted normal
	    distribution.

	    NOTE: the parameters are estimated from the same sample that is then tested,
	    so the reported p-values are useful for ranking but are optimistic as
	    formal goodness-of-fit evidence.

	    Args:
	        target_column: Numeric pandas Series; missing values are dropped.
	        show_histogram: Whether the plots include the density histogram in
	            addition to the KDE line.

	    Returns:
	        ``(result_text, best_fit_plot, normal_comparison_plot)``: the summary
	        text followed by the paths of the two PNG files.

	    Raises:
	        ValueError: If the column has no non-missing values.
	    """
	    data = target_column.dropna()
	    if data.empty:
	        raise ValueError("The selected column contains no non-missing values to fit.")

	    # Test each distribution. Fitting is best-effort: a distribution that
	    # cannot be fitted to this sample is reported and skipped, not fatal.
	    results = []
	    for name, distribution in _DISTRIBUTIONS.items():
	        try:
	            params = distribution.fit(data)
	            _, p_value = kstest(data, distribution.cdf, args=params)
	        except Exception as e:
	            print(f"Skipping {name} distribution due to an error: {e}")
	            continue
	        # A NaN p-value compares false against everything, which would leave
	        # the ranking below in an arbitrary order.
	        if not np.isfinite(p_value):
	            print(f"Skipping {name} distribution due to an error: non-finite p-value")
	            continue
	        results.append((name, p_value, params))

	    # Sort by p-value and get top 3
	    results.sort(key=lambda item: item[1], reverse=True)
	    top_3_results = results[:3]

	    # One evaluation grid over the observed range, shared by every curve.
	    x_grid = np.linspace(data.min(), data.max(), 1000)

	    fit_curves = [
	        (x_grid, _DISTRIBUTIONS[name].pdf(x_grid, *params), color,
	         f'{name} Fit (p-value={_format_p_value(p_value)})')
	        for (name, p_value, params), color in zip(top_3_results, _FIT_COLORS)
	    ]
	    best_fit_plot = _save_density_plot(
	        data, fit_curves, "Top 3 Best Fit Distributions Overlaid", show_histogram)

	    # Prepare result text with top 3 distributions and p-values
	    ranking_lines = [
	        f"Top {rank}: {name} with a p-value of {_format_p_value(p_value)}\n"
	        for rank, (name, p_value, _) in enumerate(top_3_results, start=1)
	    ]
	    result_text = "Top 3 best matched distributions:\n" + "".join(ranking_lines)

	    # Add disclaimer about p-value significance
	    result_text += "\nKolmogorov-Smirnov Test - A significant p-value (below 0.05) indicates that the distribution does not conform well to the actual data distribution."

	    # Normal distribution comparison
	    mean, std = norm.fit(data)
	    _, normal_p_value = kstest(data, 'norm', args=(mean, std))
	    normal_curve = (x_grid, norm.pdf(x_grid, mean, std), _NORMAL_COLOR,
	                    f'Normal Fit (p-value={_format_p_value(normal_p_value)})')
	    normal_comparison_plot = _save_density_plot(
	        data, [normal_curve], "Comparison with Normal Distribution (Kolmogorov-Smirnov)",
	        show_histogram)

	    return result_text, best_fit_plot, normal_comparison_plot

	# Function to load the CSV file and extract numeric column names
	def load_file(file):
	    """Read the uploaded CSV and offer its numeric columns for selection.

	    Args:
	        file: Value delivered by the ``gr.File`` component. Either an object
	            exposing the file path as ``.name`` or a plain path string is
	            accepted (which one arrives depends on the Gradio version and
	            component configuration). ``None`` is treated as "no file".

	    Returns:
	        ``(dropdown_update, df)``: a ``gr.update`` that sets the dropdown
	        choices to the names of the numeric columns, and the parsed DataFrame
	        (``None`` when no file was given).
	    """
	    if file is None:
	        # Nothing uploaded: clear the choices and the stored DataFrame.
	        return gr.update(choices=[]), None

	    # Fall back to the value itself when it is already a path string.
	    path = getattr(file, "name", file)
	    df = pd.read_csv(path)
	    numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
	    return gr.update(choices=numeric_columns), df

	# Function to analyze the selected column
	def analyze_column(selected_column, df, show_histogram):
	    """Run the distribution fit on one column of the uploaded DataFrame.

	    Args:
	        selected_column: Column name chosen in the dropdown (``None`` when
	            nothing is selected).
	        df: DataFrame stored by ``load_file`` (``None`` before any upload).
	        show_histogram: Forwarded to ``check_distribution``.

	    Returns:
	        The ``(result_text, best_fit_plot, normal_comparison_plot)`` tuple
	        produced by ``check_distribution``.

	    Raises:
	        gr.Error: If no file has been uploaded yet, or the selected column is
	            missing from the current DataFrame (for example a selection left
	            over from a previously uploaded file).
	    """
	    # Without these guards the click fails with an opaque TypeError/KeyError.
	    if df is None:
	        raise gr.Error("Please upload a CSV file before running the fit.")
	    if selected_column is None or selected_column not in df.columns:
	        raise gr.Error("Please select a numeric column from the uploaded file.")

	    return check_distribution(df[selected_column], show_histogram)

	# Define the Gradio app layout
	with gr.Blocks() as demo:
	    # Page heading
	    gr.Markdown("# Data Distribution Fit\n")

	    # Components, created in the order they are laid out on the page:
	    # upload, column choice, histogram toggle, action button, then outputs.
	    file_input = gr.File(label="Upload CSV File")
	    column_selector = gr.Dropdown(label="Select Target Column", choices=[])
	    show_histogram = gr.Checkbox(label="Show Bars", value=True)
	    analyze_button = gr.Button("Fit")
	    output_text = gr.Textbox(label="Results")
	    best_fit_plot_output = gr.Image(label="Best Fit Distributions")
	    normal_comparison_output = gr.Image(label="Comparison with Normal Distribution")

	    # Holds the parsed DataFrame between the upload event and the click event
	    df_state = gr.State(None)

	    # On upload: parse the CSV, fill the dropdown, remember the DataFrame
	    file_input.upload(
	        fn=load_file,
	        inputs=file_input,
	        outputs=[column_selector, df_state],
	    )

	    # On "Fit": analyze the chosen column and show the text plus both plots
	    analyze_button.click(
	        fn=analyze_column,
	        inputs=[column_selector, df_state, show_histogram],
	        outputs=[output_text, best_fit_plot_output, normal_comparison_output],
	    )

	demo.launch()