# alexanderholt
# 11/15/2017 - 8:52 PM
#
# PCA Linear Regression Pipeline

from sklearn.decomposition import PCA
# LinearRegression is instantiated below, so it has to be imported here;
# without this line the script stops with a NameError.
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# The two estimators that are chained into a pipeline further down.
# n_components is deliberately left unset on the PCA: the grid search
# below supplies the candidate values for it.
pca = PCA()
lr = LinearRegression()

# Make a pipeline that chains together the PCA and the linear regression.
# This means that when the X data gets "piped in" it first hits the PCA,
# which is fit to the data and then transforms the original variables
# into their principal-component "new variables".

# From here these principal components get sent into the linear regression
# to be fit.

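# This is very useful because we can actually grid search the number of
# components. Grid search is designed to run on models that return some
# kind of score we care about. PCA on its own never looks at y - it is
# just transforming the variables (its own score() is only a
# log-likelihood of X), so it cannot tell us how well the components
# predict the target. Grid searching the PCA by itself is therefore
# meaningless for our purposes.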

# However, if it then goes to a linear regression after, we can score
# it on some dependent variable in order to determine which number
# of components was the best one!

# Candidate numbers of principal components to try (1 through 5). The
# "pca__" prefix routes the parameter to the PCA step of the pipeline.
pca_grid = {'pca__n_components': list(range(1, 6))}

# Chain the steps: the PCA output becomes the input of the regression.
pca_pipe = make_pipeline(pca, lr)

from sklearn.model_selection import GridSearchCV

# Hand the whole pipeline (not just the regression) to the grid search, so
# every candidate in pca_grid is evaluated with cv=10 cross-validation.
pca_gs = GridSearchCV(estimator=pca_pipe, param_grid=pca_grid, cv=10)

# Fit on the predictors and the target exactly as with a plain estimator.
pca_gs.fit(X=subjective.values, y=reading)

# Report the winning parameter setting, then its cross-validated score.
print(pca_gs.best_params_, pca_gs.best_score_, sep="\n")