# Hierarchical Clustering Algorithm
"""
HIERARCHICAL CLUSTERING
input:
- DF: pandas df (rows = data to be classified, columns = features to be used to calculate distances)
- lvar: variables to be used as features
- isplot: plot or not results
source:
Scipy Library: https://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
Manual: https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/
"""
def hierarchical_clustering(DF,lvar,isplot=False):
import numpy as np
import copy
print('Hierarchical Clustering of: %s'%lvar)
# prepare data
X= DF[lvar].as_matrix()
## Hierarchical Clustering
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# generate the linkage matrix
Z = linkage(X, 'ward')
## DENDOGRAM
def fancy_dendrogram(*args, **kwargs):
max_d = kwargs.pop('max_d', None)
if max_d and 'color_threshold' not in kwargs:
kwargs['color_threshold'] = max_d
annotate_above = kwargs.pop('annotate_above', 0)
ddata = dendrogram(*args, **kwargs)
if not kwargs.get('no_plot', False):
plt.title('Hierarchical Clustering Dendrogram (truncated)')
plt.xlabel('sample index or (cluster size)')
plt.ylabel('distance')
for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
x = 0.5 * sum(i[1:3])
y = d[1]
if y > annotate_above:
plt.plot(x, y, 'o', c=c)
plt.annotate("%.3g" % y, (x, y), xytext=(0, -5),
textcoords='offset points',
va='top', ha='center')
if max_d:
plt.axhline(y=max_d, c='k')
return ddata
if isplot:
fancy_dendrogram(
Z,
truncate_mode='lastp',
p=20,
leaf_rotation=90.,
leaf_font_size=12.,
show_contracted=True,
annotate_above=10, # useful in small plots so annotations don't overlap
)
plt.show()
## Elbow Method for estimate the most optimal number of clusters
last = Z[-10:, 2]
last_rev = last[::-1]
if isplot:
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)
acceleration = np.diff(last, 2) # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
if isplot:
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.show()
k = acceleration_rev.argmax() + 2 # if idx 0 is the max of this we want 2 clusters
print("clusters = %s"%(k))
## CALCULATE CLUSTERS INDEX
from scipy.cluster.hierarchy import fcluster
clusters = fcluster(Z, k, criterion='maxclust')
## PLOT CLUSTERS
if isplot:
plt.figure(figsize=(6, 6))
plt.scatter(X[:,0], X[:,1], c=clusters, cmap='prism') # plot points with cluster dependent colors
plt.show()
# clean
del(X)
# return
return clusters
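
## Example usage (a minimal sketch): the DataFrame name, column names and random
## values below are hypothetical, added only to illustrate the call signature.
# import numpy as np
# import pandas as pd
# rng = np.random.default_rng(0)
# df_demo = pd.DataFrame({'x': rng.normal(size=50), 'y': rng.normal(size=50)})
# labels = hierarchical_clustering(df_demo, ['x', 'y'], isplot=False)
# df_demo['cluster'] = labels   # one integer cluster label per row
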
## calculate distances for Hierarchical Clustering Forecast
def cdist_sparse(X, Y, **kwargs):
    """ -> |X| x |Y| cdist array, for any cdist metric.
        X or Y may be scipy.sparse matrices -- CSR format works best.
    """
    from scipy.sparse import issparse          # $scipy/sparse/csr.py
    from scipy.spatial.distance import cdist   # $scipy/spatial/distance.py
    import numpy as np

    # densify one row at a time -- very slow if both X and Y are very sparse
    sxy = 2 * issparse(X) + issparse(Y)
    if sxy == 0:
        # both dense: call cdist directly
        return cdist(X, Y, **kwargs)
    d = np.empty((X.shape[0], Y.shape[0]), np.float64)
    if sxy == 2:
        # X sparse, Y dense: densify each row of X
        for j, x in enumerate(X):
            d[j] = cdist(x.todense(), Y, **kwargs)[0]
    elif sxy == 1:
        # Y sparse, X dense: densify each row of Y
        # (take the column, not row [0], so every row of X gets its own distance)
        for k, y in enumerate(Y):
            d[:, k] = cdist(X, y.todense(), **kwargs)[:, 0]
    else:
        # both sparse: densify one row of each at a time
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j, k] = cdist(x.todense(), y.todense(), **kwargs)[0, 0]
    return d
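
## Example usage of cdist_sparse (a minimal sketch): the array names and the
## 'euclidean' metric below are illustrative assumptions, not part of the original code.
# import numpy as np
# from scipy.sparse import csr_matrix
# A = csr_matrix(np.random.rand(5, 3))        # 5 sparse observations, 3 features
# B = np.random.rand(4, 3)                    # 4 dense reference points
# D = cdist_sparse(A, B, metric='euclidean')  # -> (5, 4) distance matrix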