from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

pca = PCA()
lr = LinearRegression()
# Make a pipeline that chains the PCA and the linear regression.
# When the X data gets "piped in," it first hits the PCA, which is fit to
# the data and then transforms the original variables into their principal
# component "new variables". Those principal components are then passed on
# to the linear regression to be fit.
# This is very useful because it lets us gridsearch the number of
# components: GridSearchCV needs an estimator it can score. PCA on its own
# is an unsupervised transformer that makes no predictions against a
# target, so gridsearching the PCA by itself is meaningless. With a linear
# regression attached at the end, however, we can score the whole pipeline
# on the dependent variable and determine which number of components
# works best.
pca_pipe = make_pipeline(pca, lr)
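# make_pipeline auto-names each step after its lowercased class name, which
# is where the 'pca__' prefix in the param grid below comes from. A quick,
# optional sanity check of those names:
print(pca_pipe.named_steps)  # e.g. {'pca': PCA(), 'linearregression': LinearRegression()}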
pca_grid = {
    'pca__n_components': [1, 2, 3, 4, 5]
}
from sklearn.model_selection import GridSearchCV
# Gridsearch now takes the pipeline:
pca_gs = GridSearchCV(pca_pipe, pca_grid, cv=10)
# Fit on the X and y data as usual (subjective and reading are assumed to be
# defined earlier in the notebook).
pca_gs.fit(subjective.values, reading)
print(pca_gs.best_params_)
print(pca_gs.best_score_)
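# After fitting, GridSearchCV refits the best pipeline on all of the data
# (refit=True by default), so the winning PCA can be pulled back out and
# inspected. A minimal follow-up sketch:
best_pca = pca_gs.best_estimator_.named_steps['pca']
print(best_pca.explained_variance_ratio_)  # variance explained by each retained component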