jmquintana79
7/11/2017 - 4:01 AM

## Principal Components Analysis: plot explained variance per components to decide the most optimal number of components and the transformation

Principal Components Analysis (PCA): plot the explained variance per component to decide the optimal number of components, and apply the transformation.

```python
""" PCA """

## PCA: fit and plot explained variance vs principal components
def pca_explained_variance(FEATURES):
    """Fit a PCA on FEATURES and plot the explained variance per component.

    Arguments:
        FEATURES -- pandas DataFrame of numeric features (one column per feature).

    Returns:
        None (displays a matplotlib figure; use it to pick the number of
        components to keep before calling PCA_transformation).
    """
    print('PCA of %s' % list(FEATURES.columns))

    ## build algorithm
    from sklearn import decomposition
    pca = decomposition.PCA()
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0 -- .values is the
    # long-supported equivalent
    X = FEATURES.values
    pca.fit(X)

    ## plot explained variance per component (as a percentage)
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(6, 4))
    y = 100 * pca.explained_variance_ratio_
    x = list(range(1, len(y) + 1))
    plt.plot(x, y, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_(%)')
    # annotate each point with its explained-variance percentage
    # (renamed the comprehension variable: the original reused `i`, shadowing
    # the enumerate index in the reader's eyes)
    for i, txt in enumerate(['%.2f%s' % (v, '%') for v in y]):
        ax.annotate(txt, (x[i] - 0.1, y[i] + 2.5))
    plt.show()

    # return
    return None

## PCA: transformation and extraction of principal components
def PCA_transformation(FEATURES, n_selection=None):
    """Fit a PCA on FEATURES and return the transformed principal components.

    Arguments:
        FEATURES -- pandas DataFrame of numeric features.
        n_selection -- number of leading components to keep (default None = all).

    Returns:
        [PC, pca] where PC is a DataFrame of the components (columns
        'pc1', 'pc2', ...) and pca is the fitted sklearn PCA object.
    """
    import pandas as pd

    ## build algorithm
    from sklearn import decomposition
    pca = decomposition.PCA()
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0 -- .values is the
    # long-supported equivalent
    X = FEATURES.values
    pca.fit(X)

    ## feature extraction: keep only the first n_selection components if given
    Xtransform = pca.transform(X)
    if n_selection is not None:
        Xtransform = Xtransform[:, :n_selection]

    # into DF; derive the column count from the result itself -- the original
    # built range(1, n_selection+1) and crashed with TypeError when
    # n_selection was None (the documented default)
    PC = pd.DataFrame(Xtransform)
    PC.columns = ['pc%s' % i for i in range(1, Xtransform.shape[1] + 1)]

    # return
    return [PC, pca]
```