import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression

# Data preparation
def make_data():
    rng = np.random.RandomState(0)
    n_samples = 500
    cov = [[3, 3], [3, 4]]
    X = rng.multivariate_normal(mean=[0, 0], cov=cov, size=n_samples)
    return X, rng, n_samples
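
# The covariance [[3, 3], [3, 4]] yields two principal directions with very
# different variances (eigenvalues roughly 6.5 and 0.46): the first PCA component
# captures most of the variance, the second captures very little.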

def plot_scatter_pca(alpha):
    # Start from a fresh figure so repeated slider changes do not pile layers
    # onto the same axes; gr.Plot accepts a matplotlib Figure.
    fig = plt.figure()
    plt.scatter(X[:, 0], X[:, 1], alpha=alpha, label="samples")
    for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
        comp = comp * var  # scale component by its explained variance
        plt.plot(
            [0, comp[0]],
            [0, comp[1]],
            label=f"Component {i}",
            linewidth=5,
            color=f"C{i + 2}",
        )
    plt.gca().set(
        aspect="equal",
        title="2-dimensional dataset with principal components",
        xlabel="first feature",
        ylabel="second feature",
    )
    plt.legend()
    return fig

def datagen_y():
    y = X.dot(pca.components_[1]) + rng.normal(size=n_samples) / 2
    return y
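
# The target is built from the projection of X onto the *second* (low-variance)
# principal direction, plus Gaussian noise with standard deviation 0.5, so the
# predictive signal lives along the direction that PCA ranks last by variance.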

def data_projections():
    y = datagen_y()
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3)
    axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y")
    axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3)
    axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y")
    plt.tight_layout()
    return fig
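
# As in the upstream scikit-learn example, the left panel should show y nearly
# uncorrelated with the projection onto the first component, while the right
# panel shows a strong correlation with the projection onto the second one.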

def plot_pca_ls():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pcr.fit(X_train, y_train)
    pca_step = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)
    fig, axes = plt.subplots(1, 2, figsize=(10, 3))
    axes[0].scatter(pca_step.transform(X_test), y_test, alpha=0.3, label="ground truth")
    axes[0].scatter(
        pca_step.transform(X_test), pcr.predict(X_test), alpha=0.3, label="predictions"
    )
    axes[0].set(
        xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
    )
    axes[0].legend()
    axes[1].scatter(pls.transform(X_test), y_test, alpha=0.3, label="ground truth")
    axes[1].scatter(
        pls.transform(X_test), pls.predict(X_test), alpha=0.3, label="predictions"
    )
    axes[1].set(
        xlabel="Projected data onto first PLS component", ylabel="y", title="PLS"
    )
    axes[1].legend()
    plt.tight_layout()
    return fig
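
# PCR keeps only the first (high-variance) component, which carries little
# information about y, so its predictions track it poorly; PLS chooses a
# projection that also accounts for y, so a single component already predicts it well.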

def get_components():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pls = PLSRegression(n_components=1)
    return X_train, X_test, y_train, y_test, pcr, pls

def print_results():
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
    pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
    pcr.fit(X_train, y_train)
    pls = PLSRegression(n_components=1)
    pls.fit(X_train, y_train)
    result1 = f"PCR r-squared {pcr.score(X_test, y_test):.3f}"
    result2 = f"PLS r-squared {pls.score(X_test, y_test):.3f}"
    return result1 + "\n" + result2

def calc_pcr_r2():
    X_train, X_test, y_train, y_test, pcr, pls = get_components()
    pca_2 = make_pipeline(PCA(n_components=2), LinearRegression())
    pca_2.fit(X_train, y_train)
    return f"PCR r-squared with 2 components {pca_2.score(X_test, y_test):.3f}"
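
# With two components, PCA no longer drops the informative low-variance direction,
# so this PCR variant should score on par with PLS (compare with print_results above).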

X, rng, n_samples = make_data()
pca = PCA(n_components=2).fit(X)
y = datagen_y()
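
# X, rng, n_samples, pca and y are module-level globals used by the Gradio
# callbacks; pca here is fit on the full dataset for the visualizations and to
# build the target, while the model-comparison functions re-fit on a train split.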

title = "Principal Component Regression vs Partial Least Squares Regression"

with gr.Blocks(title=title, theme="gstaff/xkcd") as demo:
    gr.Markdown(f"# {title}")
    gr.Markdown(
        """
This example compares Principal Component Regression (PCR) and Partial Least Squares Regression (PLS) on a toy dataset.
Our goal is to illustrate how PLS can outperform PCR when the target is strongly correlated with some directions in the
data that have a low variance.

PCR is a regressor composed of two steps: first, PCA is applied to the training data, possibly performing dimensionality
reduction; then, a regressor (e.g. a linear regressor) is trained on the transformed samples. In PCA, the transformation
is purely unsupervised, meaning that no information about the targets is used. As a result, PCR may perform poorly on
datasets where the target is strongly correlated with directions that have low variance. Indeed, the dimensionality
reduction of PCA projects the data into a lower-dimensional space where the variance of the projected data is greedily
maximized along each axis. Even if the low-variance directions are the most predictive of the target, they will be
dropped, and the final regressor will not be able to leverage them.

PLS is both a transformer and a regressor, and it is quite similar to PCR: it also applies a dimensionality reduction to
the samples before applying a linear regressor to the transformed data. The main difference with PCR is that the PLS
transformation is supervised. Therefore, as we will see in this example, it does not suffer from the issue we just
mentioned.
        """
    )
    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/cross_decomposition/plot_pcr_vs_pls.html#sphx-glr-auto-examples-cross-decomposition-plot-pcr-vs-pls-py).")
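
    # In code terms (see the callbacks defined above): PCR is the pipeline
    # StandardScaler() -> PCA(n_components=1) -> LinearRegression(), while PLS is
    # PLSRegression(n_components=1), whose single component is chosen using y as well as X.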
    with gr.Tab("Visualize input dataset"):
        with gr.Row(equal_height=True):
            slider1 = gr.Slider(label="alpha", minimum=0.0, maximum=1.0)
            slider1.change(
                plot_scatter_pca,
                inputs=slider1,
                outputs=gr.Plot(label="Visualizing input dataset"),
            )
    with gr.Tab("PCA data projections"):
        btn_decision = gr.Button(value="PCA data projections")
        btn_decision.click(data_projections, outputs=gr.Plot(label="PCA data projections"))
    with gr.Tab("Predictive power"):
        btn_power = gr.Button(value="Predictive power")
        btn_power.click(plot_pca_ls, outputs=gr.Plot(label="Predictive power"))
    with gr.Tab("Results tab"):
        gr.Markdown(
            """
As a final remark, we note that PCR with 2 components performs as well as PLS: this is because, in this case, PCR was
able to leverage the second component, which has the most predictive power on the target.
            """
        )
        btn_results = gr.Button(value="Results")
        out = gr.Textbox(label="r2 score of both estimators")
        btn_results.click(print_results, outputs=out)
    with gr.Tab("r2_score of predictors comparison"):
        with gr.Row(equal_height=True):
            gr.Markdown(
                """
We also print the R-squared scores of both estimators, which further confirms that PLS is a better alternative than PCR
in this case. A negative R-squared indicates that PCR performs worse than a regressor that would simply predict the mean
of the target.
                """
            )
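
            # R^2 = 1 - SS_res / SS_tot, so a model that always predicts mean(y)
            # scores exactly 0; a negative value therefore means the regressor does
            # worse than that constant baseline.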
        btn_1 = gr.Button(value="r2_score of predictors")
        out1 = gr.Textbox(label="r2_score of predictors")
        btn_1.click(calc_pcr_r2, outputs=out1)
    gr.Markdown("## End of page")

demo.launch()