Spaces:

sklearn-docs
/

pcr_vs_pls_regression

Runtime error

App Files Files Community

Jayabalambika committed on May 4, 2023

Commit

5fa63b7

1 Parent(s): 66539e8

first commit

Browse files

Files changed (1) hide show

app.py +216 -0

app.py ADDED Viewed

	@@ -0,0 +1,216 @@

+import gradio as gr
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+from sklearn.linear_model import LinearRegression
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
+from sklearn.cross_decomposition import PLSRegression
+#Data preparation
+def make_data():
+  """Generate the 2-D toy dataset used throughout the demo.
+
+  Returns:
+    A ``(X, rng, n_samples)`` tuple: the samples drawn from a zero-mean
+    bivariate normal distribution, the ``RandomState`` (seeded with 0) that
+    drew them, and the number of samples (500).
+  """
+  n_samples = 500
+  center = [0, 0]
+  covariance = [[3, 3], [3, 4]]
+  rng = np.random.RandomState(0)
+  X = rng.multivariate_normal(mean=center, cov=covariance, size=n_samples)
+  return X, rng, n_samples
+def plot_scatter_pca(alpha):
+  """Scatter the dataset and overlay its two principal components.
+
+  Reads the module-level ``X`` and the fitted module-level ``pca``.
+
+  Args:
+    alpha: Opacity of the sample markers, forwarded to ``Axes.scatter``.
+
+  Returns:
+    A new matplotlib ``Figure`` containing the plot.
+  """
+  # Draw on a dedicated figure. The implicit pyplot "current figure" is shared
+  # by every callback, so drawing through ``plt.*`` would pile a new scatter,
+  # new component lines and new legend entries onto the previous call's axes
+  # (or onto the last subplot created by another tab).
+  fig, ax = plt.subplots()
+  ax.scatter(X[:, 0], X[:, 1], alpha=alpha, label="samples")
+  for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
+      comp = comp * var  # scale component by its variance explanation power
+      ax.plot(
+          [0, comp[0]],
+          [0, comp[1]],
+          label=f"Component {i}",
+          linewidth=5,
+          color=f"C{i + 2}",
+      )
+  ax.set(
+      aspect="equal",
+      title="2-dimensional dataset with principal components",
+      xlabel="first feature",
+      ylabel="second feature",
+  )
+  ax.legend()
+  # Unregister the figure from pyplot so one is not kept alive per slider move;
+  # the Figure object itself remains valid for the caller to render.
+  plt.close(fig)
+  return fig
+def datagen_y():
+  """Build the regression target from the module-level dataset.
+
+  The target is the projection of ``X`` onto the second component of the
+  module-level ``pca``, plus normal noise (halved) drawn from the shared
+  ``rng``. Every call therefore advances ``rng`` and yields different noise.
+
+  Returns:
+    The target array.
+  """
+  projection = X.dot(pca.components_[1])
+  noise = rng.normal(size=n_samples) / 2
+  return projection + noise
+def data_projections():
+  """Plot the target against the data projected onto each PCA component.
+
+  Reads the module-level ``X``, ``y`` and fitted ``pca``.
+
+  Returns:
+    A new matplotlib ``Figure`` with two side-by-side scatter plots.
+  """
+  # Plot the module-level target ``y`` rather than calling datagen_y() again:
+  # regenerating it draws fresh noise (so the figure would not show the target
+  # the regressions are trained on) and advances the shared ``rng``, which
+  # silently changes every later train/test split.
+  fig, axes = plt.subplots(1, 2, figsize=(10, 3))
+  axes[0].scatter(X.dot(pca.components_[0]), y, alpha=0.3)
+  axes[0].set(xlabel="Projected data onto first PCA component", ylabel="y")
+  axes[1].scatter(X.dot(pca.components_[1]), y, alpha=0.3)
+  axes[1].set(xlabel="Projected data onto second PCA component", ylabel="y")
+  fig.tight_layout()
+  # Return this exact figure (not the pyplot module, whose "current figure"
+  # is global state) and unregister it so figures do not accumulate per click.
+  plt.close(fig)
+  return fig
+def plot_pca_ls():
+  """Plot ground truth vs. predictions for one-component PCR and PLS.
+
+  A fresh train/test split and unfitted models are obtained from
+  ``get_components()``; both models are fitted on the training part and
+  evaluated on the test part.
+
+  Returns:
+    A new matplotlib ``Figure`` with the PCR panel on the left and the PLS
+    panel on the right.
+  """
+  X_train, X_test, y_train, y_test, pcr, pls = get_components()
+  pcr.fit(X_train, y_train)
+  # Named ``pca_step`` so it does not shadow the module-level ``pca``.
+  pca_step = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline
+  pls.fit(X_train, y_train)
+
+  fig, axes = plt.subplots(1, 2, figsize=(10, 3))
+
+  # NOTE(review): the PCA step is applied to the raw test data here, i.e.
+  # without the pipeline's StandardScaler - kept as in the original; confirm
+  # this is the intended x-axis.
+  pcr_projection = pca_step.transform(X_test)
+  axes[0].scatter(pcr_projection, y_test, alpha=0.3, label="ground truth")
+  axes[0].scatter(
+      pcr_projection, pcr.predict(X_test), alpha=0.3, label="predictions"
+  )
+  axes[0].set(
+      xlabel="Projected data onto first PCA component", ylabel="y", title="PCR / PCA"
+  )
+  axes[0].legend()
+
+  pls_projection = pls.transform(X_test)
+  axes[1].scatter(pls_projection, y_test, alpha=0.3, label="ground truth")
+  axes[1].scatter(
+      pls_projection, pls.predict(X_test), alpha=0.3, label="predictions"
+  )
+  axes[1].set(xlabel="Projected data onto first PLS component", ylabel="y", title="PLS")
+  axes[1].legend()
+
+  fig.tight_layout()
+  # Return this exact figure (not the pyplot module, whose "current figure"
+  # is global state) and unregister it so figures do not accumulate per click.
+  plt.close(fig)
+  return fig
+def get_components():
+  """Split the data and create the two single-component models, unfitted.
+
+  The split uses the module-level ``X``, ``y`` and the shared ``rng`` as its
+  random state.
+
+  Returns:
+    ``(X_train, X_test, y_train, y_test, pcr, pls)`` where ``pcr`` is a
+    StandardScaler -> PCA(1) -> LinearRegression pipeline and ``pls`` is a
+    one-component ``PLSRegression``.
+  """
+  split = train_test_split(X, y, random_state=rng)
+  pls = PLSRegression(n_components=1)
+  pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
+  return (*split, pcr, pls)
+def print_results():
+  """Fit one-component PCR and PLS and report their test R-squared scores.
+
+  The split and the unfitted models come from ``get_components()`` instead of
+  being rebuilt inline.
+
+  Returns:
+    A two-line string: the PCR score on the first line, the PLS score on the
+    second, each formatted with three decimals.
+  """
+  X_train, X_test, y_train, y_test, pcr, pls = get_components()
+  pcr.fit(X_train, y_train)
+  pls.fit(X_train, y_train)
+  return (
+      f"PCR r-squared {pcr.score(X_test, y_test):.3f}\n"
+      f"PLS r-squared {pls.score(X_test, y_test):.3f}"
+  )
+def calc_pcr_r2():
+  """Score a two-component PCA + linear regression pipeline on the test split.
+
+  Only the data split returned by ``get_components()`` is used; the
+  one-component models it also returns are ignored.
+
+  Returns:
+    A one-line string with the R-squared score formatted with three decimals.
+  """
+  X_train, X_test, y_train, y_test, _pcr, _pls = get_components()
+  two_component_pcr = make_pipeline(PCA(n_components=2), LinearRegression())
+  two_component_pcr.fit(X_train, y_train)
+  score = two_component_pcr.score(X_test, y_test)
+  return f"PCR r-squared with 2 components {score:.3f}"
+# Module-level state read by the callbacks defined above. The order matters:
+# datagen_y() reads X, pca, rng and n_samples, and draws from the same rng
+# that make_data() has already advanced.
+X, rng, n_samples = make_data()
+# Two-component PCA fitted on the full dataset (used for plotting and for y).
+pca = PCA(n_components=2).fit(X)
+y = datagen_y()
+# Page title, reused for the Blocks title and the top heading.
+title = " Principal Component Regression vs Partial Least Squares Regression."
+# Gradio UI: an introduction followed by one tab per step of the example.
+# Each tab wires a control (slider or button) to one of the callbacks above.
+with gr.Blocks(title=title, theme='gstaff/xkcd') as demo:
+    gr.Markdown(f" # {title}")
+    gr.Markdown(
+    """
+    This example compares Principal Component Regression (PCR) and Partial Least Squares Regression (PLS) on a toy dataset.
+    Our goal is to illustrate how PLS can outperform PCR when the target is strongly correlated with some directions in the
+    data that have a low variance.
+    PCR is a regressor composed of two steps: first, PCA is applied to the training data, possibly performing dimensionality reduction;
+    then, a regressor (e.g. a linear regressor) is trained on the transformed samples.
+    In PCA, the transformation is purely unsupervised, meaning that no information about the targets is used.
+    As a result, PCR may perform poorly in some datasets where the target is strongly correlated with directions that have low variance.
+    Indeed, the dimensionality reduction of PCA projects the data into a lower dimensional space where the variance of the projected data
+    is greedily maximized along each axis. Despite them having the most predictive power on the target,
+    the directions with a lower variance will be dropped, and the final regressor will not be able to leverage them.
+    PLS is both a transformer and a regressor, and it is quite similar to PCR:
+    it also applies a dimensionality reduction to the samples before applying a linear regressor to the transformed data.
+    The main difference with PCR is that the PLS transformation is supervised. Therefore, as we will see in this example,
+    it does not suffer from the issue we just mentioned.
+    """)
+    gr.Markdown("You can see the associated scikit-learn example [here](https://scikit-learn.org/stable/auto_examples/cross_decomposition/plot_pcr_vs_pls.html#sphx-glr-auto-examples-cross-decomposition-plot-pcr-vs-pls-py).")
+    with gr.Tab("Visualize Input dataset"):
+        with gr.Row(equal_height=True):
+            slider1 = gr.Slider(label="alpha", minimum=0.0, maximum=1.0)
+            scatter_plot = gr.Plot(label='Visualizing input dataset')
+            slider1.change(plot_scatter_pca, slider1, outputs=scatter_plot)
+    with gr.Tab("PCA data projections"):
+        btn_decision = gr.Button(value="PCA data projections")
+        projections_plot = gr.Plot(label='PCA data projections')
+        btn_decision.click(data_projections, outputs=projections_plot)
+    with gr.Tab("predictive power"):
+        btn_power = gr.Button(value="Predictive power")
+        power_plot = gr.Plot(label='Predictive power')
+        btn_power.click(plot_pca_ls, outputs=power_plot)
+    with gr.Tab("Results tab"):
+        # This tab reports the one-component PCR and PLS scores, so it carries
+        # the text describing those scores.
+        gr.Markdown(
+            """
+            We also print the R-squared scores of both estimators, which further confirms that PLS is a better alternative than PCR in this case.
+            A negative R-squared indicates that PCR performs worse than a regressor that would simply predict the mean of the target.
+            """)
+        btn_results = gr.Button(value="Results")
+        out = gr.Textbox(label="r2 score of both estimators")
+        btn_results.click(print_results, outputs=out)
+    with gr.Tab("r2_score of predictors comparison"):
+        with gr.Row(equal_height=True):
+            # This tab reports the two-component PCR score, so it carries the
+            # closing remark about PCR with 2 components.
+            gr.Markdown(
+                """
+                As a final remark,
+                we note that PCR with 2 components performs as well as PLS: this is because in this case,
+                PCR was able to leverage the second component which has the most predictive power on the target.
+                """)
+            btn_1 = gr.Button(value="r2_score of predictors")
+            out1 = gr.Textbox(label="r2_score of predictors")
+            btn_1.click(calc_pcr_r2, outputs=out1)
+    gr.Markdown("## End of page")
+demo.launch()