jmquintana79
7/11/2017 - 4:01 AM

## Principal Components Analysis: plot explained variance per components to decide the most optimal number of components and the transformation

Principal Components Analysis (PCA): plot the explained variance per component to decide the optimal number of components, and apply the transformation.

```python
""" PCA """

## PCA: fit and plot explained variance vs principal components
def pca_explained_variance(FEATURES):
    """Fit a PCA on FEATURES and plot the explained variance per component.

    Arguments:
        FEATURES -- pandas DataFrame of numeric features (one column per feature).

    Returns:
        None (displays a matplotlib figure; use it to pick the number of
        components to keep before calling PCA_transformation).
    """
    print('PCA of %s' % list(FEATURES.columns))

    ## build algorithm
    from sklearn import decomposition
    pca = decomposition.PCA()
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0 -- .values is the
    # long-supported equivalent
    X = FEATURES.values
    pca.fit(X)

    ## plot explained variance per component (as a percentage)
    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(figsize=(6, 4))
    y = 100 * pca.explained_variance_ratio_
    x = list(range(1, len(y) + 1))
    plt.plot(x, y, linewidth=2)
    plt.axis('tight')
    plt.xlabel('n_components')
    plt.ylabel('explained_variance_(%)')
    # annotate each point with its explained-variance percentage
    # (renamed the comprehension variable: the original reused `i`, shadowing
    # the enumerate index in the reader's eyes)
    for i, txt in enumerate(['%.2f%s' % (v, '%') for v in y]):
        ax.annotate(txt, (x[i] - 0.1, y[i] + 2.5))
    plt.show()

    # return
    return None

## PCA: transformation and extraction of principal components
def PCA_transformation(FEATURES, n_selection=None):
    """Fit a PCA on FEATURES and return the transformed principal components.

    Arguments:
        FEATURES -- pandas DataFrame of numeric features.
        n_selection -- number of leading components to keep (default None = all).

    Returns:
        [PC, pca] where PC is a DataFrame of the components (columns
        'pc1', 'pc2', ...) and pca is the fitted sklearn PCA object.
    """
    import pandas as pd

    ## build algorithm
    from sklearn import decomposition
    pca = decomposition.PCA()
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0 -- .values is the
    # long-supported equivalent
    X = FEATURES.values
    pca.fit(X)

    ## feature extraction: keep only the first n_selection components if given
    Xtransform = pca.transform(X)
    if n_selection is not None:
        Xtransform = Xtransform[:, :n_selection]

    # into DF; derive the column count from the result itself -- the original
    # built range(1, n_selection+1) and crashed with TypeError when
    # n_selection was None (the documented default)
    PC = pd.DataFrame(Xtransform)
    PC.columns = ['pc%s' % i for i in range(1, Xtransform.shape[1] + 1)]

    # return
    return [PC, pca]
```