Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,9 +12,8 @@ import tempfile
|
|
| 12 |
# Suppress specific runtime warnings
|
| 13 |
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
| 14 |
|
| 15 |
-
|
| 16 |
# Function to check distribution type
|
| 17 |
-
def check_distribution(target_column):
|
| 18 |
data = target_column.dropna()
|
| 19 |
|
| 20 |
# Distribution dictionaries
|
|
@@ -69,10 +68,11 @@ def check_distribution(target_column):
|
|
| 69 |
# Create a single plot for all distributions
|
| 70 |
plt.figure(figsize=(12, 8), dpi=400)
|
| 71 |
|
| 72 |
-
# Plot the original data distribution as a histogram
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
-
#
|
| 76 |
sns.kdeplot(data, color=actual_data_color, lw=2, label='Actual Data Distribution Line')
|
| 77 |
|
| 78 |
# Overlay the top 3 best fit distributions
|
|
@@ -110,7 +110,8 @@ def check_distribution(target_column):
|
|
| 110 |
p_value_text = "<0.001" if normal_p_value < 0.001 else f"{normal_p_value:.5f}"
|
| 111 |
|
| 112 |
plt.figure(figsize=(12, 8), dpi=400)
|
| 113 |
-
|
|
|
|
| 114 |
sns.kdeplot(data, color=actual_data_color, lw=2, label='Actual Data Distribution Line')
|
| 115 |
plt.plot(normal_best_fit_data, normal_pdf, color=normal_color, lw=2, label=f'Normal Fit (p-value={p_value_text})')
|
| 116 |
plt.title("Comparison with Normal Distribution")
|
|
@@ -125,26 +126,24 @@ def check_distribution(target_column):
|
|
| 125 |
|
| 126 |
return result_text, best_fit_plot, normal_comparison_plot
|
| 127 |
|
| 128 |
-
|
| 129 |
# Function to load the CSV file and extract numeric column names
|
| 130 |
def load_file(file):
|
| 131 |
df = pd.read_csv(file.name)
|
| 132 |
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 133 |
return gr.update(choices=numeric_columns), df
|
| 134 |
|
| 135 |
-
|
| 136 |
# Function to analyze the selected column
|
| 137 |
-
def analyze_column(selected_column, df):
|
| 138 |
-
result_text, best_fit_plot, normal_comparison_plot = check_distribution(df[selected_column])
|
| 139 |
return result_text, best_fit_plot, normal_comparison_plot
|
| 140 |
|
| 141 |
-
|
| 142 |
# Define the Gradio app layout
|
| 143 |
with gr.Blocks() as demo:
|
| 144 |
gr.Markdown("# Data Distribution Fit\n")
|
| 145 |
|
| 146 |
file_input = gr.File(label="Upload CSV File")
|
| 147 |
column_selector = gr.Dropdown(label="Select Target Column", choices=[])
|
|
|
|
| 148 |
analyze_button = gr.Button("Fit")
|
| 149 |
output_text = gr.Textbox(label="Results")
|
| 150 |
best_fit_plot_output = gr.Image(label="Best Fit Distributions")
|
|
@@ -157,7 +156,7 @@ with gr.Blocks() as demo:
|
|
| 157 |
file_input.upload(load_file, inputs=file_input, outputs=[column_selector, df_state])
|
| 158 |
|
| 159 |
# Perform analysis on the selected column
|
| 160 |
-
analyze_button.click(analyze_column, inputs=[column_selector, df_state],
|
| 161 |
outputs=[output_text, best_fit_plot_output, normal_comparison_output])
|
| 162 |
|
| 163 |
-
demo.launch()
|
|
|
|
| 12 |
# Suppress specific runtime warnings
|
| 13 |
warnings.filterwarnings("ignore", category=RuntimeWarning)
|
| 14 |
|
|
|
|
| 15 |
# Function to check distribution type
|
| 16 |
+
def check_distribution(target_column, show_histogram):
|
| 17 |
data = target_column.dropna()
|
| 18 |
|
| 19 |
# Distribution dictionaries
|
|
|
|
| 68 |
# Create a single plot for all distributions
|
| 69 |
plt.figure(figsize=(12, 8), dpi=400)
|
| 70 |
|
| 71 |
+
# Plot the original data distribution as a histogram if show_histogram is True
|
| 72 |
+
if show_histogram:
|
| 73 |
+
sns.histplot(data, kde=False, stat="density", bins=50, color=actual_data_color, label='Actual Data Distribution')
|
| 74 |
|
| 75 |
+
# Always plot the KDE line
|
| 76 |
sns.kdeplot(data, color=actual_data_color, lw=2, label='Actual Data Distribution Line')
|
| 77 |
|
| 78 |
# Overlay the top 3 best fit distributions
|
|
|
|
| 110 |
p_value_text = "<0.001" if normal_p_value < 0.001 else f"{normal_p_value:.5f}"
|
| 111 |
|
| 112 |
plt.figure(figsize=(12, 8), dpi=400)
|
| 113 |
+
if show_histogram:
|
| 114 |
+
sns.histplot(data, kde=False, stat="density", bins=50, color=actual_data_color, label='Actual Data Distribution')
|
| 115 |
sns.kdeplot(data, color=actual_data_color, lw=2, label='Actual Data Distribution Line')
|
| 116 |
plt.plot(normal_best_fit_data, normal_pdf, color=normal_color, lw=2, label=f'Normal Fit (p-value={p_value_text})')
|
| 117 |
plt.title("Comparison with Normal Distribution")
|
|
|
|
| 126 |
|
| 127 |
return result_text, best_fit_plot, normal_comparison_plot
|
| 128 |
|
|
|
|
| 129 |
# Function to load the CSV file and extract numeric column names
|
| 130 |
def load_file(file):
|
| 131 |
df = pd.read_csv(file.name)
|
| 132 |
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 133 |
return gr.update(choices=numeric_columns), df
|
| 134 |
|
|
|
|
| 135 |
# Function to analyze the selected column
|
| 136 |
+
def analyze_column(selected_column, df, show_histogram):
|
| 137 |
+
result_text, best_fit_plot, normal_comparison_plot = check_distribution(df[selected_column], show_histogram)
|
| 138 |
return result_text, best_fit_plot, normal_comparison_plot
|
| 139 |
|
|
|
|
| 140 |
# Define the Gradio app layout
|
| 141 |
with gr.Blocks() as demo:
|
| 142 |
gr.Markdown("# Data Distribution Fit\n")
|
| 143 |
|
| 144 |
file_input = gr.File(label="Upload CSV File")
|
| 145 |
column_selector = gr.Dropdown(label="Select Target Column", choices=[])
|
| 146 |
+
show_histogram = gr.Checkbox(label="Show Histogram", value=True)
|
| 147 |
analyze_button = gr.Button("Fit")
|
| 148 |
output_text = gr.Textbox(label="Results")
|
| 149 |
best_fit_plot_output = gr.Image(label="Best Fit Distributions")
|
|
|
|
| 156 |
file_input.upload(load_file, inputs=file_input, outputs=[column_selector, df_state])
|
| 157 |
|
| 158 |
# Perform analysis on the selected column
|
| 159 |
+
analyze_button.click(analyze_column, inputs=[column_selector, df_state, show_histogram],
|
| 160 |
outputs=[output_text, best_fit_plot_output, normal_comparison_output])
|
| 161 |
|
| 162 |
+
demo.launch()
|